def main():
    """Report basic statistics for every matrix named on the command line.

    Each entry of ``args.matrices`` is loaded with ``hm.hiCMatrix`` and its
    size, element sum, bin length, chromosome names, number of non-zero
    elements, minimum/maximum stored value and number of NaN bins are
    reported.  For files accepted by ``check_cooler`` the result of
    ``getInformationCoolerBinNames()`` is reported as well.

    The report is written to ``args.outFileName`` when that option is set
    and printed to standard output otherwise.  The output file is truncated
    for the first matrix and appended to for every further one, so it ends
    up holding one report per matrix; previously every matrix overwrote the
    report of the one before it.
    """
    args = parse_arguments().parse_args()
    for index, matrix in enumerate(args.matrices):
        hic_ma = hm.hiCMatrix(matrix)
        size = hic_ma.matrix.shape[0]
        num_non_zero = hic_ma.matrix.nnz
        # NOTE(review): halving the total also halves the diagonal; confirm
        # that this is the intended definition of "Sum".
        sum_elements = hic_ma.matrix.sum() / 2
        bin_length = hic_ma.getBinSize()
        num_nan_bins = len(hic_ma.nan_bins)
        # NOTE(review): presumably fails on a matrix without any stored
        # value (min/max of an empty array) -- confirm and guard if needed.
        min_non_zero = hic_ma.matrix.data.min()
        max_non_zero = hic_ma.matrix.data.max()
        chromosomes = list(hic_ma.chrBinBoundaries)

        # Build the report once; both output targets share the same lines.
        report = [
            "File:\t{}".format(matrix),
            "Size:\t{:,}".format(size),
            "Sum:\t{:,}".format(sum_elements),
            "Bin_length:\t{}".format(bin_length),
            "Chromosomes:\t{}".format(", ".join(toString(chromosomes))),
            "Non-zero elements:\t{:,}".format(num_non_zero),
            "Minimum (non zero):\t{}".format(min_non_zero),
            "Maximum:\t{}".format(max_non_zero),
            "NaN bins:\t{}".format(num_nan_bins),
        ]
        if check_cooler(matrix):
            report.append('The following columns are available: {}'.format(
                hic_ma.getInformationCoolerBinNames()))

        if args.outFileName:
            # Truncate only once, then append, so no report is lost.
            mode = 'w' if index == 0 else 'a'
            with open(args.outFileName, mode) as file:
                file.write(
                    "# Matrix information file. Created with HiCExplorer's hicInfo version {}\n"
                    .format(__version__))
                # Every line is newline-terminated so appended reports never
                # run into the previous one.
                file.write('\n'.join(report) + '\n')
        else:
            print('\n'.join(report))
def main():
    """Print a short statistics summary for each matrix given on the command line.

    Every entry of ``matrices`` is loaded with ``hm.hiCMatrix``; all values
    are gathered first and then printed one per line as tab-separated
    ``label<TAB>value`` pairs.  For files accepted by ``check_cooler`` the
    matrix' ``getInformationCoolerBinNames()`` is called at the end.
    """
    def emit(template, value):
        # One output line: fill the single placeholder and print it.
        print(template.format(value))

    options = parse_arguments().parse_args()
    for matrix_path in options.matrices:
        loaded = hm.hiCMatrix(matrix_path)

        # Gather every value before anything is printed.
        n_bins = loaded.matrix.shape[0]
        n_stored = loaded.matrix.nnz
        # NOTE(review): halving the total also halves the diagonal; confirm
        # that this is the intended definition of "Sum".
        half_total = loaded.matrix.sum() / 2
        resolution = loaded.getBinSize()
        n_nan = len(loaded.nan_bins)
        smallest = loaded.matrix.data.min()
        largest = loaded.matrix.data.max()
        chrom_names = list(loaded.chrBinBoundaries)

        emit("File:\t{}", matrix_path)
        emit("Size:\t{:,}", n_bins)
        emit("Sum:\t{:,}", half_total)
        emit("Bin_length:\t{}", resolution)
        emit("Chromosomes:\t{}", ", ".join(toString(chrom_names)))
        emit("Non-zero elements:\t{:,}", n_stored)
        emit("Minimum (non zero):\t{}", smallest)
        emit("Maximum:\t{}", largest)
        emit("NaN bins:\t{}", n_nan)

        if not check_cooler(matrix_path):
            continue
        # The return value is deliberately not used here, as in the
        # original code.
        loaded.getInformationCoolerBinNames()
def adjustMatrix(pArgs):
    """Load ``pArgs.matrix`` and keep, remove or mask parts of it.

    Exactly one selection is applied:

    * ``pArgs.chromosomes`` -- whole chromosomes are kept (reordered),
      removed or masked, depending on ``pArgs.action``.  Names that are not
      present in the matrix are skipped with a warning.
    * ``pArgs.regions`` -- a tab-separated file with at least three columns
      (chromosome, start, end).  The bins covered by the listed regions are
      kept, masked or removed.
    * ``pArgs.maskBadRegions`` -- the matrix is only loaded.

    Returns the adjusted matrix object, or ``None`` when no selection was
    given.  Terminates the process with exit status 1 when both
    ``--chromosomes`` and ``--regions`` are given or when no valid
    chromosome/region remains.
    """
    if pArgs.chromosomes is not None and pArgs.regions is not None:
        log.error('Please specify either --chromosomes or --regions.')
        exit(1)
    hic_matrix = None
    if pArgs.chromosomes:
        if check_cooler(pArgs.matrix) and len(pArgs.chromosomes) == 1 and pArgs.action == 'keep':
            # A single chromosome to keep from a cooler file: load only
            # that chromosome instead of the full matrix.
            chromosomes_list = cooler.Cooler(pArgs.matrix).chromnames
            if pArgs.chromosomes[0] in chromosomes_list:
                hic_matrix = hm.hiCMatrix(pArgs.matrix, pChrnameList=pArgs.chromosomes)
            else:
                log.error('Chromosome not available in matrix: {} {}'.format(pArgs.matrix, pArgs.chromosomes[0]))
                exit(1)
        else:
            hic_matrix = hm.hiCMatrix(pArgs.matrix)
            chromosomes_list = list(hic_matrix.chrBinBoundaries)

        chromosomes_list_to_operate_on = []
        for chromosome in pArgs.chromosomes:
            if chromosome in chromosomes_list:
                chromosomes_list_to_operate_on.append(chromosome)
            else:
                log.warning('Chromosome not available in matrix: {} {}'.format(pArgs.matrix, chromosome))
        if len(chromosomes_list_to_operate_on) == 0:
            log.error('No valid chromosome given: {}. Available: {}'.format(pArgs.chromosomes, chromosomes_list))
            exit(1)

        if pArgs.action == 'keep':
            hic_matrix.reorderChromosomes(chromosomes_list_to_operate_on)
        elif pArgs.action == 'remove':
            # Build a new list instead of removing from the list being
            # iterated: in-place removal skips the element that follows
            # each removed one, so adjacent chromosomes survived.
            to_remove = set(chromosomes_list_to_operate_on)
            chromosomes_list = [chromosome for chromosome in chromosomes_list
                                if chromosome not in to_remove]
            hic_matrix.reorderChromosomes(chromosomes_list)
        elif pArgs.action == 'mask':
            hic_matrix.maskChromosomes(chromosomes_list_to_operate_on)
    elif pArgs.regions:
        hic_matrix = hm.hiCMatrix(pArgs.matrix)
        chromosomes_list = list(hic_matrix.chrBinBoundaries)
        genomic_regions = []
        with open(pArgs.regions, 'r') as file:
            for line in file:
                _line = line.strip().split('\t')
                log.debug('_line {}'.format(_line))
                # Count columns, not characters of the raw line.
                if len(_line) < 3:
                    log.warning("An entry shorter than 3 columns has been found!")
                    continue
                chrom, start, end = _line[0], int(_line[1]), int(_line[2])
                log.debug('chrom {}'.format(chrom))
                if chrom in chromosomes_list:
                    genomic_regions.append((chrom, start, end))
                else:
                    log.warning('Chromosome not available in matrix, '
                                'ignoring regions: {} {}'.format(pArgs.matrix, chrom))
        if len(genomic_regions) == 0:
            log.error('No valid chromosome given. Available: {}'.format(chromosomes_list))
            exit(1)

        matrix_indices_regions = []
        for region in genomic_regions:
            log.debug('region {}'.format(region))
            region_chrom, region_start, region_end = region
            _regionBinRange = hic_matrix.getRegionBinRange(region_chrom, region_start, region_end)
            if _regionBinRange is None:
                continue
            start, end = _regionBinRange
            matrix_indices_regions.extend(range(start, end + 1))  # end is inclusive, so +1
            if pArgs.action == 'remove':
                # Use the chromosome of the region being processed; the
                # variable left over from the parsing loop refers to the
                # last line of the regions file.
                chr_start, chr_end = hic_matrix.getChrBinRange(region_chrom)
                if (start > chr_start) and (end < chr_end - 1):
                    # The entry is reported with its genomic coordinates as
                    # read from the regions file.
                    log.warning("{}:{}-{} entry may generate discounted regions on a chromosome."
                                "Please consider using `mask` action to deal with that.".format(
                                    region_chrom, region_start, region_end))

        if pArgs.action == 'keep':
            hic_matrix.reorderBins(matrix_indices_regions)
        elif pArgs.action == 'mask':
            hic_matrix.maskBins(matrix_indices_regions)
        elif pArgs.action == 'remove':
            hic_matrix.maskBins(matrix_indices_regions)
            # Reset the masking bookkeeping after the bins were masked.
            # NOTE(review): presumably this makes the removal permanent --
            # confirm against hiCMatrix.maskBins.
            hic_matrix.orig_bin_ids = []
            hic_matrix.orig_cut_intervals = []
            hic_matrix.nan_bins = []
    elif pArgs.maskBadRegions:
        # pArgs.chromosomes is None or empty in this branch, so the
        # single-chromosome shortcut can never apply; the former
        # len(pArgs.chromosomes) check raised TypeError for None.
        hic_matrix = hm.hiCMatrix(pArgs.matrix)
    else:
        log.info('No data to adjust given. Please specify either --chromosomes or --region parameter.')
    return hic_matrix
def main(args=None):
    """Keep, remove or mask chromosomes or regions of a matrix and save it.

    ``args`` is the argument list handed to the parser (``None`` lets the
    parser use the process arguments).  Either ``--chromosomes`` or
    ``--regions`` selects what ``args.action`` (``keep``, ``remove`` or
    ``mask``) operates on; giving both terminates the process with exit
    status 1.  The regions file is tab separated with exactly three columns
    (chromosome, start, end); other lines are ignored.  The result is
    written to ``args.outFileName`` unless no selection was given.
    """
    args = parse_arguments().parse_args(args)
    if args.chromosomes is not None and args.regions is not None:
        log.error('Please specify either --chromosomes or --regions.')
        exit(1)
    hic_ma = None
    if args.chromosomes:
        if check_cooler(args.matrix) and len(args.chromosomes) == 1 and args.action == 'keep':
            # Only one chromosome of a cooler file is wanted: load just it.
            hic_ma = hm.hiCMatrix(args.matrix, pChrnameList=args.chromosomes)
        else:
            hic_ma = hm.hiCMatrix(args.matrix)

        if args.action == 'keep':
            hic_ma.reorderChromosomes(args.chromosomes)
        elif args.action == 'remove':
            unwanted = set(args.chromosomes)
            chromosomes = [chromosome for chromosome in hic_ma.chrBinBoundaries
                           if chromosome not in unwanted]
            hic_ma.reorderChromosomes(chromosomes)
        elif args.action == 'mask':
            hic_ma.maskChromosomes(args.chromosomes)
    elif args.regions:
        hic_ma = hm.hiCMatrix(args.matrix)
        genomic_regions = []
        with open(args.regions, 'r') as file:
            for line in file:
                _line = line.strip().split('\t')
                # Blank or malformed lines do not split into three fields
                # and are skipped.
                if len(_line) == 3:
                    # NOTE(review): start is passed on as text while end is
                    # converted and decremented -- confirm that
                    # getRegionBinRange accepts a string start.
                    chrom, start, end = _line[0], _line[1], int(_line[2]) - 1
                    genomic_regions.append((chrom, start, end))

        matrix_indices_regions = []
        for region in genomic_regions:
            _regionBinRange = hic_ma.getRegionBinRange(region[0], region[1], region[2])
            if _regionBinRange is not None:
                start, end = _regionBinRange
                matrix_indices_regions.extend(range(start, end))

        if args.action == 'keep':
            hic_ma.reorderBins(matrix_indices_regions)
        elif args.action == 'mask':
            hic_ma.maskBins(matrix_indices_regions)
        elif args.action == 'remove':
            # Keep every bin that is not listed.  The index array is forced
            # to an integer dtype: an empty list would otherwise become a
            # float array, which cannot be used as an index.
            number_of_bins = max(hic_ma.matrix.shape[0], hic_ma.matrix.shape[1])
            keep_mask = np.ones(number_of_bins, dtype=bool)
            keep_mask[np.asarray(matrix_indices_regions, dtype=int)] = False
            hic_ma.reorderBins(np.flatnonzero(keep_mask))
    else:
        log.info('No data to adjust given. Please specify either --chromosomes or --region parameter.')

    if hic_ma is not None:
        hic_ma.save(args.outFileName)
def main(args=None):
    """Plot and tabulate the short-range / long-range contact ratio per chromosome.

    For every matrix in ``args.matrices`` the chromosomes are split into
    ``args.threads`` chunks and each chunk is handed to
    ``compute_relation_short_long_range`` in its own process.  The ratios
    collected per matrix are drawn as one box per matrix and saved to
    ``args.plotFileName``.  With more than one matrix, pairwise rank-sum
    p-values are written to ``args.outFileName``.  The per-chromosome ratio
    and the two sums are written to ``args.outFileNameData``.

    ``args`` is the argument list handed to the parser (``None`` lets the
    parser use the process arguments).
    """
    args = parse_arguments().parse_args(args)
    short_v_long_range = []
    sum_smaller = []
    sum_greater = []
    for matrix in args.matrices:
        is_cooler = check_cooler(matrix)
        # Cooler files are passed on by name; other formats are loaded here.
        if not is_cooler:
            hic_matrix = hm.hiCMatrix(matrix)
        else:
            hic_matrix = matrix
        if args.chromosomes is None:
            # No explicit selection: use all chromosomes of the matrix.
            if not is_cooler:
                chromosomes_list = list(hic_matrix.chrBinBoundaries)
            else:
                chromosomes_list = cooler.Cooler(matrix).chromnames
        else:
            chromosomes_list = args.chromosomes

        short_v_long_range_matrix_threads = [None] * args.threads
        sum_smaller_threads = [None] * args.threads
        sum_greater_threads = [None] * args.threads
        chromosomesListPerThread = len(chromosomes_list) // args.threads
        queue = [None] * args.threads
        process = [None] * args.threads
        thread_done = [False] * args.threads
        for i in range(args.threads):
            if i < args.threads - 1:
                chromosomeListThread = chromosomes_list[
                    i * chromosomesListPerThread:(i + 1) * chromosomesListPerThread]
            else:
                # The last worker also takes the remainder of the division.
                chromosomeListThread = chromosomes_list[i * chromosomesListPerThread:]
            queue[i] = Queue()
            process[i] = Process(target=compute_relation_short_long_range,
                                 kwargs=dict(pHiCMatrix=hic_matrix,
                                             pChromosomes=chromosomeListThread,
                                             pDistance=args.distance,
                                             pIsCooler=is_cooler,
                                             pQueue=queue[i]))
            process[i].start()

        # Poll the queues until every worker has delivered its result.
        # NOTE(review): a worker that ends without putting a result on its
        # queue keeps this loop waiting forever.
        while not all(thread_done):
            for i in range(args.threads):
                if queue[i] is not None and not queue[i].empty():
                    (short_v_long_range_matrix_threads[i],
                     sum_smaller_threads[i],
                     sum_greater_threads[i]) = queue[i].get()
                    queue[i] = None
                    process[i].join()
                    process[i].terminate()
                    process[i] = None
                    thread_done[i] = True
            if not all(thread_done):
                # Wait only while results are still outstanding.
                time.sleep(1)

        # Flatten the per-worker lists into one list per matrix.
        short_v_long_range.append(
            [item for sublist in short_v_long_range_matrix_threads for item in sublist])
        sum_smaller.append(
            [item for sublist in sum_smaller_threads for item in sublist])
        sum_greater.append(
            [item for sublist in sum_greater_threads for item in sublist])

    log.debug(short_v_long_range)
    plt.ylabel('Sum short range / long range')
    plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
    box_plot = plt.boxplot(short_v_long_range, patch_artist=True)
    legend_handels_color = []
    for i, patch in enumerate(box_plot['boxes']):
        color = args.colorList[i % len(args.colorList)]
        patch.set_facecolor(color)
        legend_handels_color.append(
            mpatches.Patch(color=color, label=args.matrices[i].split('/')[-1]))
    plt.legend(handles=legend_handels_color)
    plt.xlabel('Boxplot shows svl-ratio per chromosome.')
    plt.savefig(args.plotFileName, dpi=args.dpi)

    if len(args.matrices) > 1:
        # Every matrix against every later one, in the same order for the
        # samples and for the names written to the file.
        p_values = []
        for i, sample in enumerate(short_v_long_range):
            for sample2 in short_v_long_range[i + 1:]:
                _, significance_level = ranksums(sample, sample2)
                p_values.append(significance_level)
        log.debug('p_values {}'.format(p_values))
        matrix_pairs = [(matrix_0, matrix_1)
                        for i, matrix_0 in enumerate(args.matrices)
                        for matrix_1 in args.matrices[i + 1:]]
        with open(args.outFileName, 'w') as file:
            header = '# Created with HiCExplorer\'s hicPlotSVL ' + __version__ + '\n'
            header += "# Short range vs long range contacts per chromosome, p-values of each distribution against each other distribution with Wilcoxon rank-sum\n"
            header += '# Short range contacts: <= ' + str(args.distance) + '\n'
            file.write(header)
            for (matrix_0, matrix_1), p_value in zip(matrix_pairs, p_values):
                file.write(matrix_0 + '\t' + matrix_1 + '\t' + str(p_value) + '\n')

    with open(args.outFileNameData, 'w') as file:
        header = '# Created with HiCExplorer\'s hicPlotSVL ' + __version__ + '\n'
        header += "# Short range vs long range contacts per chromosome: raw data\n"
        header += '# Short range contacts: <= ' + str(args.distance) + '\n'
        matrices_names = '\t\t\t'.join(args.matrices)
        header += '#\t{}\n'.format(matrices_names)
        header += '# Chromosome\t'
        header += '\t'.join([
            'Ratio',
            'Sum <= {}'.format(args.distance),
            'Sum > {}'.format(args.distance)
        ] * len(args.matrices))
        header += '\n'
        file.write(header)
        # NOTE(review): the rows follow the chromosome list of the last
        # matrix processed -- confirm this is intended when the matrices
        # have different chromosomes.
        for i, chromosome in enumerate(chromosomes_list):
            file.write('{}\t'.format(chromosome))
            for j in range(len(args.matrices)):
                if i < len(short_v_long_range[j]):
                    file.write('{}\t{}\t{}\t'.format(short_v_long_range[j][i],
                                                     sum_smaller[j][i],
                                                     sum_greater[j][i]))
                else:
                    # A matrix without this row still occupies its three
                    # columns, otherwise later matrices shift to the left.
                    file.write('\t\t\t')
            file.write('\n')
def main(args=None):
    """Report information about every matrix named on the command line.

    For a file accepted by ``check_cooler`` -- and while ``args.no_metadata``
    is truthy -- the values are taken from the ``info`` mapping of the
    cooler file without loading the matrix.  Otherwise the matrix is loaded
    with ``hm.hiCMatrix`` and the values are computed from it.  Only values
    that could be determined are reported.

    The report is written to ``args.outFileName`` when that option is set
    and printed otherwise.  The output file is truncated for the first
    matrix and appended to for every further one, so it ends up holding one
    report per matrix; previously every matrix overwrote the report of the
    one before it.

    ``args`` is the argument list handed to the parser (``None`` lets the
    parser use the process arguments).
    """
    args = parse_arguments().parse_args(args)
    for index, matrix in enumerate(args.matrices):
        generated_by = None
        genome_assembly = None
        statistics = None
        generated_by_cooler_lib = None
        tool_url = None
        matrix_generated_by = None
        matrix_generated_by_url = None
        creation_date = None
        bin_length = None
        size = None
        nchroms = None
        num_non_zero = None
        min_non_zero = None
        max_non_zero = None
        sum_elements = None
        num_nan_bins = None

        if check_cooler(matrix) and args.no_metadata:
            cooler_file = cooler.Cooler(matrix)
            # Fetch the mapping once instead of once per key.
            info = cooler_file.info
            if info is not None:
                # Plain values: a missing key leaves the value at None.
                bin_length = info.get('bin-size')
                size = info.get('nbins')
                nchroms = info.get('nchroms')
                num_non_zero = info.get('nnz')
                min_non_zero = info.get('min-value')
                max_non_zero = info.get('max-value')
                creation_date = info.get('creation-date')
                sum_elements = info.get('sum-elements')

                # Text values are converted only when the key is present.
                if 'generated-by' in info:
                    generated_by = toString(info['generated-by'])
                if 'genome-assembly' in info:
                    genome_assembly = toString(info['genome-assembly'])
                if 'generated-by-cooler-lib' in info:
                    generated_by_cooler_lib = toString(info['generated-by-cooler-lib'])
                if 'tool-url' in info:
                    tool_url = toString(info['tool-url'])
                if 'matrix-generated-by' in info:
                    matrix_generated_by = toString(info['matrix-generated-by'])
                if 'matrix-generated-by-url' in info:
                    matrix_generated_by_url = toString(info['matrix-generated-by-url'])

                metadata = info.get('metadata')
                if metadata is not None and 'statistics' in metadata:
                    statistics = metadata['statistics']
            chromosome_sizes = cooler_file.chromsizes
        else:
            hic_ma = hm.hiCMatrix(matrix)
            size = hic_ma.matrix.shape[0]
            num_non_zero = hic_ma.matrix.nnz
            # Off-diagonal entries are halved, the diagonal is counted once.
            diagonal_sum = hic_ma.matrix.diagonal().sum()
            sum_elements = ((hic_ma.matrix.sum() - diagonal_sum) / 2) + diagonal_sum
            bin_length = hic_ma.getBinSize()
            num_nan_bins = len(hic_ma.nan_bins)
            # NOTE(review): presumably fails on a matrix without any stored
            # value (min/max of an empty array) -- confirm and guard.
            min_non_zero = hic_ma.matrix.data.min()
            max_non_zero = hic_ma.matrix.data.max()
            chromosome_sizes = hic_ma.get_chromosome_sizes()

        # Collect the report pieces and join them once at the end.
        pieces = [
            "# Matrix information file. Created with HiCExplorer's hicInfo version {}\n".format(__version__)
        ]
        if matrix is not None:
            pieces.append("File:\t{}\n".format(matrix))
        if creation_date is not None:
            pieces.append("Date:\t{}\n".format(creation_date))
        if genome_assembly is not None:
            pieces.append("Genome assembly:\t{}\n".format(genome_assembly))
        if size is not None:
            pieces.append("Size:\t{:,}\n".format(size))
        if bin_length is not None:
            pieces.append("Bin_length:\t{}\n".format(bin_length))
        if sum_elements is not None:
            pieces.append("Sum of matrix:\t{}\n".format(sum_elements))
        pieces.append("Chromosomes:length: ")
        for key, value in chromosome_sizes.items():
            pieces.append("{}: {} bp; ".format(key, value))
        pieces.append('\n')
        if nchroms is not None:
            pieces.append("Number of chromosomes:\t{}\n".format(nchroms))
        if num_non_zero is not None:
            pieces.append("Non-zero elements:\t{:,}\n".format(num_non_zero))
        if min_non_zero is not None:
            pieces.append("Minimum (non zero):\t{}\n".format(min_non_zero))
        if max_non_zero is not None:
            pieces.append("Maximum:\t{}\n".format(max_non_zero))
        if num_nan_bins is not None:
            pieces.append("NaN bins:\t{}\n".format(num_nan_bins))
        if check_cooler(matrix):
            pieces.append('The following columns are available: {}\n'.format(
                cooler.Cooler(matrix).bins().columns.values))
        if generated_by is not None:
            pieces.append("\n\nGenerated by:\t{}\n".format(generated_by))
        if generated_by_cooler_lib is not None:
            pieces.append("Cooler library version:\t{}\n".format(generated_by_cooler_lib))
        if tool_url is not None:
            pieces.append("HiCMatrix url:\t{}\n".format(tool_url))
        if matrix_generated_by is not None:
            pieces.append("Interaction matrix created with:\t{}\n".format(matrix_generated_by))
        if matrix_generated_by_url is not None:
            pieces.append("URL:\t{}\n".format(matrix_generated_by_url))
        if statistics is not None:
            pieces.append("\n\nBuild statistics:\n{}\n".format(statistics))
        report = ''.join(pieces)

        if args.outFileName:
            # Truncate only once, then append, so no report is lost.
            mode = 'w' if index == 0 else 'a'
            with open(args.outFileName, mode) as file:
                file.write(report)
        else:
            print(report)
def adjustMatrix(pArgs):
    """Load ``pArgs.matrix`` and keep, remove or mask parts of it.

    Exactly one selection is applied:

    * ``pArgs.chromosomes`` -- whole chromosomes are kept (reordered),
      removed or masked, depending on ``pArgs.action``.  Names that are not
      present in the matrix are skipped with a warning.
    * ``pArgs.regions`` -- a tab-separated file with at least three columns
      (chromosome, start, end).  For ``keep`` all stored values in rows
      outside the listed bins are set to zero, for ``mask`` the listed bins
      are masked and for ``remove`` the matrix is reduced to the bins that
      are not listed.
    * ``pArgs.maskBadRegions`` -- the matrix is only loaded.

    Returns the adjusted matrix object, or ``None`` when no selection was
    given.  Terminates the process with exit status 1 when both
    ``--chromosomes`` and ``--regions`` are given or when no valid
    chromosome/region remains.
    """
    if pArgs.chromosomes is not None and pArgs.regions is not None:
        log.error('Please specify either --chromosomes or --regions.')
        exit(1)
    hic_matrix = None
    if pArgs.chromosomes:
        if check_cooler(pArgs.matrix) and len(pArgs.chromosomes) == 1 and pArgs.action == 'keep':
            # A single chromosome to keep from a cooler file: load only
            # that chromosome instead of the full matrix.
            chromosomes_list = cooler.Cooler(pArgs.matrix).chromnames
            if pArgs.chromosomes[0] in chromosomes_list:
                hic_matrix = hm.hiCMatrix(pArgs.matrix, pChrnameList=pArgs.chromosomes)
            else:
                log.error('Chromosome not available in matrix: {} {}'.format(pArgs.matrix, pArgs.chromosomes[0]))
                exit(1)
        else:
            hic_matrix = hm.hiCMatrix(pArgs.matrix)
            chromosomes_list = list(hic_matrix.chrBinBoundaries)

        chromosomes_list_to_operate_on = []
        for chromosome in pArgs.chromosomes:
            if chromosome in chromosomes_list:
                chromosomes_list_to_operate_on.append(chromosome)
            else:
                log.warning('Chromosome not available in matrix: {} {}'.format(pArgs.matrix, chromosome))
        if len(chromosomes_list_to_operate_on) == 0:
            log.error('No valid chromosome given: {}. Available: {}'.format(pArgs.chromosomes, chromosomes_list))
            exit(1)

        if pArgs.action == 'keep':
            hic_matrix.reorderChromosomes(chromosomes_list_to_operate_on)
        elif pArgs.action == 'remove':
            # Build a new list instead of removing from the list being
            # iterated: in-place removal skips the element that follows
            # each removed one, so adjacent chromosomes survived.
            to_remove = set(chromosomes_list_to_operate_on)
            chromosomes_list = [chromosome for chromosome in chromosomes_list
                                if chromosome not in to_remove]
            hic_matrix.reorderChromosomes(chromosomes_list)
        elif pArgs.action == 'mask':
            hic_matrix.maskChromosomes(chromosomes_list_to_operate_on)
    elif pArgs.regions:
        hic_matrix = hm.hiCMatrix(pArgs.matrix)
        chromosomes_list = list(hic_matrix.chrBinBoundaries)
        genomic_regions = []
        with open(pArgs.regions, 'r') as file:
            for line in file:
                _line = line.strip().split('\t')
                # Count columns, not characters of the raw line.
                if len(_line) < 3:
                    log.warning("An entry shorter than 3 columns has been found!")
                    continue
                chrom, start, end = _line[0], int(_line[1]), int(_line[2])
                if chrom in chromosomes_list:
                    genomic_regions.append((chrom, start, end))
                else:
                    log.warning('Chromosome not available in matrix, ignoring regions: {} {}'.format(pArgs.matrix, chrom))
        if len(genomic_regions) == 0:
            log.error('No valid chromosome given. Available: {}'.format(chromosomes_list))
            exit(1)

        matrix_indices_regions = []
        for region in genomic_regions:
            _regionBinRange = hic_matrix.getRegionBinRange(region[0], region[1], region[2])
            if _regionBinRange is not None:
                start, end = _regionBinRange
                matrix_indices_regions.extend(range(start, end))

        if pArgs.action == 'keep':
            # Zero every stored value whose row is not one of the selected
            # bins, then drop the explicit zeros.
            # NOTE(review): assumes nonzero() lists the entries in the same
            # order and number as matrix.data -- TODO confirm.
            instances, _ = hic_matrix.matrix.nonzero()
            outside_selection = np.logical_not(np.isin(instances, matrix_indices_regions))
            hic_matrix.matrix.data[outside_selection] = 0
            hic_matrix.matrix.eliminate_zeros()
        elif pArgs.action == 'mask':
            hic_matrix.maskBins(matrix_indices_regions)
        elif pArgs.action == 'remove':
            # Keep every bin that is not listed.  The index array is forced
            # to an integer dtype: an empty list would otherwise become a
            # float array, which cannot be used as an index.
            number_of_bins = max(hic_matrix.matrix.shape[0], hic_matrix.matrix.shape[1])
            keep_mask = np.ones(number_of_bins, dtype=bool)
            keep_mask[np.asarray(matrix_indices_regions, dtype=int)] = False
            hic_matrix.reorderBins(np.flatnonzero(keep_mask))
    elif pArgs.maskBadRegions:
        # pArgs.chromosomes is None or empty in this branch, so the
        # single-chromosome shortcut can never apply; the former
        # len(pArgs.chromosomes) check raised TypeError for None.
        hic_matrix = hm.hiCMatrix(pArgs.matrix)
    else:
        log.info('No data to adjust given. Please specify either --chromosomes or --region parameter.')
    return hic_matrix