def main():
    """Command-line entry point: parse the arguments and run the conversion.

    Usage::

        python hic2cool.py <infile (.hic)> <outfile (.cool)>
            [-r RESOLUTION] [-n {KR,NONE,VC,VC_SQRT}] [-e]

    Positional arguments are the ``.hic`` input and the ``.cool`` output.
    ``-r`` defaults to 0 (all resolutions), ``-n`` defaults to ``KR`` and
    ``-e`` is a flag that excludes the mitochondria (MT).
    """
    resolution_help = (
        "integer bp resolution desired in cooler file. "
        "Setting to 0 (default) will use all resolutions. "
        "If all resolutions are used, a multi-res .cool file will be created, "
        "which has a different hdf5 structure. See the README for more info"
    )
    normalization_help = (
        "string normalization type. "
        "Defaults to KR, optionally NONE, VC, or VC_SQRT"
    )

    cli = argparse.ArgumentParser()
    cli.add_argument("infile", help=".hic input file")
    cli.add_argument("outfile", help=".cool output file")
    cli.add_argument("-r", "--resolution", help=resolution_help, type=int,
                     default=0)
    cli.add_argument("-n", "--normalization", help=normalization_help,
                     choices=['KR', 'NONE', 'VC', 'VC_SQRT'], default='KR')
    cli.add_argument("-e", "--exclude_MT",
                     help="if used, exclude the mitochondria (MT) from the output",
                     action="store_true")
    options = cli.parse_args()

    # These parameters are adapted from theaidenlab/straw: KR is the default
    # normalization type and BP is the unit for binsize.
    # NOTE(review): the trailing positional True is forwarded as-is; its
    # meaning is defined by hic2cool_convert, which is not visible here.
    hic2cool_convert(
        options.infile,
        options.outfile,
        options.resolution,
        options.normalization,
        options.exclude_MT,
        True,
    )
def test_2_run_exclude_missings_2500000(self):
    """Convert at ``self.binsize2`` with default options.

    Passes when nothing containing 'WARNING' was written to the captured
    stderr and the output file exists afterwards.
    """
    with captured_output() as (out, err):
        hic2cool_convert(self.infile_name, self.outfile_name2, self.binsize2)
    read_err = err.getvalue().strip()
    # assertNotIn reports the offending stderr text on failure, whereas
    # assertFalse(... in ...) would only say "True is not false".
    self.assertNotIn('WARNING', read_err)
    self.assertTrue(os.path.isfile(self.outfile_name2))
def test_run_MT_error(self):
    """Conversion without excluding MT must exit and report an ERROR."""
    with captured_output() as (out, err):
        # this should fail, because test file is missing chrMT
        # and excludeMT was not specified
        with self.assertRaises(SystemExit):
            hic2cool_convert(self.infile_name, self.outfile_name,
                             self.binsize, self.normalization)
    read_err = err.getvalue().strip()
    # assertIn shows the captured stderr when the marker is missing.
    self.assertIn('ERROR', read_err)
def test_run_exclude_MT_multi_res(self):
    """Multi-resolution conversion with the fifth argument set to True.

    Passes when no 'ERROR' appears on the captured stderr and the
    multi-resolution output file exists.
    """
    # run hic2cool for all resolutions in the hic file (resolution 0)
    with captured_output() as (out, err):
        # The fifth positional argument is True here (presumably the
        # exclude-MT switch, going by the test name), so unlike the
        # MT-error test this call is expected to succeed.
        hic2cool_convert(self.infile_name, self.outfile_name_all, 0,
                         self.normalization, True)
    read_err = err.getvalue().strip()
    # assertNotIn reports the captured stderr if an ERROR slipped through.
    self.assertNotIn('ERROR', read_err)
    self.assertTrue(os.path.isfile(self.outfile_name_all))
def test_3_run_exclude_missing_multi_res_no_norms(self):
    """Multi-resolution conversion of ``self.infile_no_norms``.

    Passes when the 'WARNING. No normalization vectors' message is present
    on the captured stderr and the output file exists.
    """
    # run hic2cool for all resolutions in the hic file (resolution 0)
    with captured_output() as (out, err):
        # The conversion is expected to complete: the assertions below
        # require a warning on stderr and an output file, not a failure.
        hic2cool_convert(self.infile_no_norms, self.outfile_no_norms, 0)
    read_err = err.getvalue().strip()
    # assertIn shows the captured stderr when the warning is missing.
    self.assertIn('WARNING. No normalization vectors', read_err)
    self.assertTrue(os.path.isfile(self.outfile_no_norms))
def test_run_exclude_MT_1000000(self):
    """Convert at ``self.binsize2`` with the fifth argument set to True.

    Passes when no 'ERROR' appears on the captured stderr and the output
    file exists.
    """
    with captured_output() as (out, err):
        hic2cool_convert(self.infile_name, self.outfile_name2,
                         self.binsize2, self.normalization, True)
    read_err = err.getvalue().strip()
    # assertNotIn reports the captured stderr if an ERROR slipped through.
    self.assertNotIn('ERROR', read_err)
    self.assertTrue(os.path.isfile(self.outfile_name2))
def main(args=None):
    """Convert Hi-C matrices between file formats as requested on the command line.

    Args:
        args: Optional argument list forwarded to the ``parse_args`` method
            of the parser returned by ``parse_arguments()``.

    Returns:
        None.

    Exits with status 1 when the number of inputs, outputs or bed files is
    inconsistent, when a ``hic`` input is combined with an output format
    other than ``cool``, or when ``2D-text`` input lacks ``--resolutions`` or
    ``--chromosomeSizes``.
    """
    args = parse_arguments().parse_args(args)
    log.debug(args)
    # parse from hicpro, homer, h5 and hic to cool
    if args.inputFormat != 'hic' and args.outputFormat != 'mcool':
        if len(args.matrices) != len(args.outFileName):
            log.error(
                'Number of input matrices does not match number output matrices!: Input matrices {}; output matrices {}'
                .format(len(args.matrices), len(args.outFileName)))
            # sys.exit instead of the site-provided exit(), which is meant
            # for interactive use and is not guaranteed to exist.
            sys.exit(1)
    if args.inputFormat == 'hic' and args.outputFormat != 'cool':
        log.error('The export of a hic file is only possible to a cool file.')
        sys.exit(1)

    if args.inputFormat == 'hic' and args.outputFormat == 'cool':
        log.info('Converting with hic2cool.')
        for i, matrix in enumerate(args.matrices):
            if args.resolutions is None:
                # resolution 0 is handed to hic2cool when none was requested
                hic2cool_convert(matrix, args.outFileName[i], 0)
            else:
                for resolution in args.resolutions:
                    # insert "_<resolution>" before the last dot-separated
                    # component of the output name
                    out_name = args.outFileName[i].split('.')
                    out_name[-2] = out_name[-2] + '_' + str(resolution)
                    out_name = '.'.join(out_name)
                    hic2cool_convert(matrix, out_name, resolution)
        return
    elif args.inputFormat in ['hicpro', 'homer', 'h5', 'cool', '2D-text']:
        format_was_h5 = args.inputFormat == 'h5'
        applyCorrection = not args.store_applied_correction

        if args.inputFormat == 'hicpro':
            if len(args.matrices) != len(args.bedFileHicpro):
                log.error(
                    'Number of matrices and associated bed files need to be the same.'
                )
                log.error('Matrices: {}; Bed files: {}'.format(
                    len(args.matrices), len(args.bedFileHicpro)))
                sys.exit(1)
        if args.inputFormat == '2D-text':
            if args.resolutions is None:
                log.error('The resolution must be defined via --resolutions')
                sys.exit(1)
            if args.chromosomeSizes is None:
                log.error(
                    'The sizes of the chromosomes must be defined via --chromosomeSizes.'
                )
                sys.exit(1)

        for i, matrix in enumerate(args.matrices):
            if args.inputFormat == 'hicpro':
                matrixFileHandlerInput = MatrixFileHandler(
                    pFileType=args.inputFormat,
                    pMatrixFile=matrix,
                    pBedFileHicPro=args.bedFileHicpro[i])
                _matrix, cut_intervals, nan_bins, \
                    distance_counts, correction_factors = matrixFileHandlerInput.load()
            elif args.inputFormat == '2D-text':
                # Read "<chromosome>\t<size>" rows; as before, the first
                # blank line (or end of file) ends the table.
                chrom_sizes = OrderedDict()
                with open(args.chromosomeSizes.name, 'r') as file:
                    for raw_line in file:
                        entry = raw_line.strip()
                        if entry == '':
                            break
                        line_split = entry.split('\t')
                        chrom_sizes[line_split[0]] = int(line_split[1])
                chrom_sizes = list(chrom_sizes.items())

                args.resolutions = [int(x) for x in args.resolutions]

                # Tile every chromosome with bins of the first requested
                # resolution; the last bin is clipped to the chromosome end.
                cut_intervals = []
                for chromosome in chrom_sizes:
                    for interval in range(0, chromosome[1],
                                          args.resolutions[0]):
                        cut_intervals.append(
                            (chromosome[0], interval,
                             min(chromosome[1],
                                 interval + args.resolutions[0]), 1.0))

                hic_matrix_csr = lil_matrix(
                    (len(cut_intervals), len(cut_intervals)))
                log.debug('cut_intervals {}'.format(cut_intervals[:20]))

                hic_matrix = HiCMatrix.hiCMatrix()
                hic_matrix.setMatrix(hic_matrix_csr, cut_intervals)

                # Each row holds: chrom1 start1 end1 chrom2 start2 end2 value
                with open(matrix, 'r') as file:
                    for j, line in enumerate(file):
                        line_split = line.split('\t')
                        chromosome_1 = str(line_split[0])
                        start_1 = int(line_split[1])
                        end_1 = int(line_split[2])
                        chromosome_2 = str(line_split[3])
                        start_2 = int(line_split[4])
                        end_2 = int(line_split[5])
                        value = float(line_split[6])

                        bin_id_1 = hic_matrix.getRegionBinRange(
                            chromosome_1, start_1, end_1)
                        bin_id_2 = hic_matrix.getRegionBinRange(
                            chromosome_2, start_2, end_2)
                        try:
                            hic_matrix.matrix[bin_id_1, bin_id_2] = value
                        except Exception as exp:
                            # best effort: a row that cannot be stored is
                            # logged at debug level and skipped
                            log.debug(str(exp))
                        if j % 1000 == 0:
                            log.debug('{} lines computed'.format(j))
                log.debug('csr with values filled!')

                hic_matrix.matrix = hic_matrix.matrix.tocsr()
                _matrix, cut_intervals, nan_bins, \
                    distance_counts, correction_factors = hic_matrix.matrix, hic_matrix.cut_intervals, hic_matrix.nan_bins, \
                    hic_matrix.distance_counts, hic_matrix.correction_factors
            else:
                correction_operator = None
                if args.correction_division:
                    correction_operator = '/'

                chromosomes_to_load = None
                if args.chromosome:
                    chromosomes_to_load = [args.chromosome]

                applyCorrectionCoolerLoad = not args.load_raw_values

                matrixFileHandlerInput = MatrixFileHandler(
                    pFileType=args.inputFormat,
                    pMatrixFile=matrix,
                    pCorrectionFactorTable=args.correction_name,
                    pCorrectionOperator=correction_operator,
                    pChrnameList=chromosomes_to_load,
                    pEnforceInteger=args.enforce_integer,
                    pApplyCorrectionCoolerLoad=applyCorrectionCoolerLoad)
                _matrix, cut_intervals, nan_bins, \
                    distance_counts, correction_factors = matrixFileHandlerInput.load()
                log.debug('cut_intervals {}'.format(cut_intervals[:20]))

            log.debug('Setting done')

            if args.outputFormat in ['cool', 'h5', 'homer', 'ginteractions']:
                log.debug('cool h5 homer ginteractions hicpro branch')
                if args.outputFormat in ['homer', 'ginteractions']:
                    log.debug('homer ginteractions branch')
                    # make it a upper triangular matrix in case it is not already
                    _matrix = triu(_matrix)
                    # make it a full symmetrical matrix
                    _matrix = _matrix.maximum(_matrix.T)

                hic2CoolVersion = None
                cool_metadata = None
                if args.inputFormat == 'cool':
                    # carry the cool file's version/metadata over to the output
                    hic2CoolVersion = matrixFileHandlerInput.matrixFile.hic2cool_version
                    cool_metadata = matrixFileHandlerInput.matrixFile.hic_metadata
                    log.debug('cool_metadata {}'.format(cool_metadata))

                matrixFileHandlerOutput = MatrixFileHandler(
                    pFileType=args.outputFormat,
                    pEnforceInteger=args.enforce_integer,
                    pFileWasH5=format_was_h5,
                    pHic2CoolVersion=hic2CoolVersion,
                    pHiCInfo=cool_metadata)
                matrixFileHandlerOutput.set_matrix_variables(
                    _matrix, cut_intervals, nan_bins, correction_factors,
                    distance_counts)
                log.debug('len(args.outFileName) {}, i {}'.format(
                    len(args.outFileName), i))
                matrixFileHandlerOutput.save(args.outFileName[i],
                                             pSymmetric=True,
                                             pApplyCorrection=applyCorrection)

            if args.outputFormat == 'hicpro':
                log.debug('hicpro branch')
                if len(args.matrices) == len(args.outFileName) and len(
                        args.outFileName) == len(args.bedFileHicpro):
                    log.debug('args.bedFileHicpro[i] {}'.format(
                        args.bedFileHicpro[i]))
                    matrixFileHandlerOutput = MatrixFileHandler(
                        pFileType=args.outputFormat,
                        pBedFileHicPro=args.bedFileHicpro[i])
                    matrixFileHandlerOutput.set_matrix_variables(
                        _matrix, cut_intervals, nan_bins, correction_factors,
                        distance_counts)
                    matrixFileHandlerOutput.save(
                        args.outFileName[i],
                        pSymmetric=True,
                        pApplyCorrection=applyCorrection)
                else:
                    # Report the count of args.matrices: the previous
                    # "args.matrix" is not an attribute used anywhere else
                    # and would raise instead of printing this message.
                    log.error(
                        'The number of input matrices, output files and bed files does not match: Input: {}; Output: {}; Bed: {}'
                        .format(len(args.matrices), len(args.outFileName),
                                len(args.bedFileHicpro)))
                    sys.exit(1)
            elif args.outputFormat in ['mcool']:
                log.debug('outformat is mcool')
                if args.resolutions and len(args.matrices) > 1:
                    # only reported; processing continues as before
                    log.error(
                        'Please define one matrix and many resolutions which should be created or multiple matrices.'
                    )
                if args.resolutions:
                    log.info(
                        'Correction factors are removed. They are not valid for any new created resolution.'
                    )
                    hic_matrix = HiCMatrix.hiCMatrix()
                    hic_matrix.setMatrix(_matrix, cut_intervals)
                    bin_size = hic_matrix.getBinSize()

                    hic2CoolVersion = None
                    cool_metadata = None
                    if args.inputFormat == 'cool':
                        hic2CoolVersion = matrixFileHandlerInput.matrixFile.hic2cool_version
                        cool_metadata = matrixFileHandlerInput.matrixFile.hic_metadata

                    for j, resolution in enumerate(args.resolutions):
                        # merge on a copy so every resolution starts from
                        # the unmerged input matrix
                        hic_matrix_res = deepcopy(hic_matrix)
                        _mergeFactor = int(resolution) // bin_size
                        log.debug('bin size {}'.format(bin_size))
                        log.debug('_mergeFactor {}'.format(_mergeFactor))
                        if int(resolution) != bin_size:
                            merged_matrix = hicMergeMatrixBins.merge_bins(
                                hic_matrix_res, _mergeFactor)
                        else:
                            merged_matrix = hic_matrix_res
                        # the first resolution is written without the append
                        # flag, later ones with it
                        append = j > 0
                        matrixFileHandlerOutput = MatrixFileHandler(
                            pFileType='cool',
                            pEnforceInteger=args.enforce_integer,
                            pAppend=append,
                            pFileWasH5=format_was_h5,
                            pHic2CoolVersion=hic2CoolVersion,
                            pHiCInfo=cool_metadata)
                        matrixFileHandlerOutput.set_matrix_variables(
                            merged_matrix.matrix, merged_matrix.cut_intervals,
                            merged_matrix.nan_bins,
                            merged_matrix.correction_factors,
                            merged_matrix.distance_counts)
                        matrixFileHandlerOutput.save(
                            args.outFileName[0] + '::/resolutions/' +
                            str(resolution),
                            pSymmetric=True,
                            pApplyCorrection=applyCorrection)
                else:
                    # no resolutions given: every input matrix is stored
                    # under its own bin size in the first output file
                    append = i > 0
                    hic_matrix = HiCMatrix.hiCMatrix()
                    hic_matrix.setMatrix(_matrix, cut_intervals)
                    bin_size = hic_matrix.getBinSize()
                    matrixFileHandlerOutput = MatrixFileHandler(
                        pFileType='cool',
                        pAppend=append,
                        pFileWasH5=format_was_h5)
                    matrixFileHandlerOutput.set_matrix_variables(
                        _matrix, cut_intervals, nan_bins, correction_factors,
                        distance_counts)
                    matrixFileHandlerOutput.save(
                        args.outFileName[0] + '::/resolutions/' +
                        str(bin_size),
                        pSymmetric=True,
                        pApplyCorrection=applyCorrection)
def main(args=None):
    """Convert Hi-C matrices between file formats as requested on the command line.

    Args:
        args: Optional argument list forwarded to the ``parse_args`` method
            of the parser returned by ``parse_arguments()``.

    Returns:
        None.

    Exits with status 1 when the number of input matrices does not match the
    number of output files (non-``hic`` input, non-``mcool`` output) or the
    number of HiC-Pro bed files.
    """
    args = parse_arguments().parse_args(args)
    log.debug(args)
    # parse from hicpro, homer, h5 and hic to cool
    if args.inputFormat != 'hic' and args.outputFormat != 'mcool':
        if len(args.matrices) != len(args.outFileName):
            log.error(
                'Number of input matrices does not match number output matrices!'
            )
            # sys.exit instead of the site-provided exit(), which is meant
            # for interactive use and is not guaranteed to exist.
            sys.exit(1)

    if args.inputFormat == 'hic' and args.outputFormat == 'cool':
        log.info('Converting with hic2cool.')
        for i, matrix in enumerate(args.matrices):
            if args.resolutions is None:
                # resolution 0 is handed to hic2cool when none was requested
                hic2cool_convert(matrix, args.outFileName[i], 0)
            else:
                for resolution in args.resolutions:
                    # insert "_<resolution>" before the last dot-separated
                    # component of the output name
                    out_name = args.outFileName[i].split('.')
                    out_name[-2] = out_name[-2] + '_' + str(resolution)
                    out_name = '.'.join(out_name)
                    hic2cool_convert(matrix, out_name, resolution)
        return
    elif args.inputFormat in ['hicpro', 'homer', 'h5', 'cool']:
        format_was_h5 = args.inputFormat == 'h5'
        applyCorrection = not args.store_applied_correction

        if args.inputFormat == 'hicpro':
            if len(args.matrices) != len(args.bedFileHicpro):
                log.error(
                    'Number of matrices and associated bed files need to be the same.'
                )
                log.error('Matrices: {}; Bed files: {}'.format(
                    len(args.matrices), len(args.bedFileHicpro)))
                sys.exit(1)

        for i, matrix in enumerate(args.matrices):
            if args.inputFormat == 'hicpro':
                matrixFileHandlerInput = MatrixFileHandler(
                    pFileType=args.inputFormat,
                    pMatrixFile=matrix,
                    pBedFileHicPro=args.bedFileHicpro[i])
            else:
                correction_operator = None
                if args.correction_division:
                    correction_operator = '/'

                chromosomes_to_load = None
                if args.chromosome:
                    chromosomes_to_load = [args.chromosome]

                applyCorrectionCoolerLoad = not args.load_raw_values

                matrixFileHandlerInput = MatrixFileHandler(
                    pFileType=args.inputFormat,
                    pMatrixFile=matrix,
                    pCorrectionFactorTable=args.correction_name,
                    pCorrectionOperator=correction_operator,
                    pChrnameList=chromosomes_to_load,
                    pEnforceInteger=args.enforce_integer,
                    pApplyCorrectionCoolerLoad=applyCorrectionCoolerLoad)

            # one load call serves both handler variants built above
            _matrix, cut_intervals, nan_bins, \
                distance_counts, correction_factors = matrixFileHandlerInput.load()

            log.debug('Setting done')

            if args.outputFormat in ['cool', 'h5', 'homer', 'ginteractions']:
                if args.outputFormat in ['homer', 'ginteractions']:
                    # make it a upper triangular matrix in case it is not already
                    _matrix = triu(_matrix)
                    # make it a full symmetrical matrix
                    _matrix = _matrix.maximum(_matrix.T)

                hic2CoolVersion = None
                if args.inputFormat == 'cool':
                    # carry the cool file's hic2cool version over to the output
                    hic2CoolVersion = matrixFileHandlerInput.matrixFile.hic2cool_version

                matrixFileHandlerOutput = MatrixFileHandler(
                    pFileType=args.outputFormat,
                    pEnforceInteger=args.enforce_integer,
                    pFileWasH5=format_was_h5,
                    pHic2CoolVersion=hic2CoolVersion)
                matrixFileHandlerOutput.set_matrix_variables(
                    _matrix, cut_intervals, nan_bins, correction_factors,
                    distance_counts)
                matrixFileHandlerOutput.save(args.outFileName[i],
                                             pSymmetric=True,
                                             pApplyCorrection=applyCorrection)
            elif args.outputFormat in ['mcool']:
                log.debug('outformat is mcool')
                if args.resolutions and len(args.matrices) > 1:
                    # only reported; processing continues as before
                    log.error(
                        'Please define one matrix and many resolutions which should be created or multiple matrices.'
                    )
                if args.resolutions:
                    log.info(
                        'Correction factors are removed. They are not valid for any new created resolution.'
                    )
                    hic_matrix = HiCMatrix.hiCMatrix()
                    hic_matrix.setMatrix(_matrix, cut_intervals)
                    bin_size = hic_matrix.getBinSize()

                    for j, resolution in enumerate(args.resolutions):
                        # merge on a copy so every resolution starts from
                        # the unmerged input matrix
                        hic_matrix_res = deepcopy(hic_matrix)
                        _mergeFactor = int(resolution) // bin_size
                        log.debug('bin size {}'.format(bin_size))
                        log.debug('_mergeFactor {}'.format(_mergeFactor))
                        if int(resolution) != bin_size:
                            merged_matrix = hicMergeMatrixBins.merge_bins(
                                hic_matrix_res, _mergeFactor)
                        else:
                            merged_matrix = hic_matrix_res
                        # the first resolution is written without the append
                        # flag, later ones with it
                        append = j > 0
                        matrixFileHandlerOutput = MatrixFileHandler(
                            pFileType='cool',
                            pEnforceInteger=args.enforce_integer,
                            pAppend=append,
                            pFileWasH5=format_was_h5)
                        matrixFileHandlerOutput.set_matrix_variables(
                            merged_matrix.matrix, merged_matrix.cut_intervals,
                            merged_matrix.nan_bins,
                            merged_matrix.correction_factors,
                            merged_matrix.distance_counts)
                        matrixFileHandlerOutput.save(
                            args.outFileName[0] + '::/resolutions/' +
                            str(resolution),
                            pSymmetric=True,
                            pApplyCorrection=applyCorrection)
                else:
                    # no resolutions given: every input matrix is stored
                    # under its own bin size in the first output file
                    append = i > 0
                    hic_matrix = HiCMatrix.hiCMatrix()
                    hic_matrix.setMatrix(_matrix, cut_intervals)
                    bin_size = hic_matrix.getBinSize()
                    matrixFileHandlerOutput = MatrixFileHandler(
                        pFileType='cool',
                        pAppend=append,
                        pFileWasH5=format_was_h5)
                    matrixFileHandlerOutput.set_matrix_variables(
                        _matrix, cut_intervals, nan_bins, correction_factors,
                        distance_counts)
                    matrixFileHandlerOutput.save(
                        args.outFileName[0] + '::/resolutions/' +
                        str(bin_size),
                        pSymmetric=True,
                        pApplyCorrection=applyCorrection)
def main(args=None):
    """Convert Hi-C matrices between file formats as requested on the command line.

    Args:
        args: Optional argument list forwarded to the ``parse_args`` method
            of the parser returned by ``parse_arguments()``.

    Returns:
        None.

    Exits with status 1 when the number of input matrices does not match the
    number of output files (non-``hic`` input, non-``mcool`` output) or the
    number of HiC-Pro bed files.
    """
    args = parse_arguments().parse_args(args)
    log.debug(args)
    # parse from hicpro, homer, h5 and hic to cool
    if args.inputFormat != 'hic' and args.outputFormat != 'mcool':
        if len(args.matrices) != len(args.outFileName):
            log.error(
                'Number of input matrices does not match number output matrices!'
            )
            # sys.exit instead of the site-provided exit(), which is meant
            # for interactive use and is not guaranteed to exist.
            sys.exit(1)

    if args.inputFormat == 'hic' and args.outputFormat == 'cool':
        log.info('Converting with hic2cool.')
        for i, matrix in enumerate(args.matrices):
            if args.resolutions is None:
                # resolution 0 is handed to hic2cool when none was requested
                hic2cool_convert(matrix, args.outFileName[i], 0)
            else:
                # The per-resolution output name must be built inside the
                # loop: the earlier code referenced an undefined
                # ``split_name`` and used ``resolution`` before it was bound.
                for resolution in args.resolutions:
                    # insert "_<resolution>" before the last dot-separated
                    # component of the output name
                    out_name = args.outFileName[i].split('.')
                    out_name[-2] = out_name[-2] + '_' + str(resolution)
                    out_name = '.'.join(out_name)
                    hic2cool_convert(matrix, out_name, resolution)
        return
    elif args.inputFormat in ['hicpro', 'homer', 'h5', 'cool']:
        applyCorrection = not args.store_applied_correction

        if args.inputFormat == 'hicpro':
            if len(args.matrices) != len(args.bedFileHicpro):
                log.error(
                    'Number of matrices and associated bed files need to be the same.'
                )
                log.error('Matrices: {}; Bed files: {}'.format(
                    len(args.matrices), len(args.bedFileHicpro)))
                sys.exit(1)

        for i, matrix in enumerate(args.matrices):
            if args.inputFormat == 'hicpro':
                matrixFileHandlerInput = MatrixFileHandler(
                    pFileType=args.inputFormat,
                    pMatrixFile=matrix,
                    pBedFileHicPro=args.bedFileHicpro[i])
            else:
                correction_operator = None
                if args.correction_division:
                    correction_operator = '/'

                chromosomes_to_load = None
                if args.chromosome:
                    chromosomes_to_load = [args.chromosome]

                matrixFileHandlerInput = MatrixFileHandler(
                    pFileType=args.inputFormat,
                    pMatrixFile=matrix,
                    pCorrectionFactorTable=args.correction_name,
                    pCorrectionOperator=correction_operator,
                    pChrnameList=chromosomes_to_load,
                    pEnforceInteger=args.enforce_integer)

            # NOTE(review): the last two values are unpacked as
            # (correction_factors, distance_counts); confirm this matches the
            # order MatrixFileHandler.load() actually returns.
            _matrix, cut_intervals, nan_bins, \
                correction_factors, distance_counts = matrixFileHandlerInput.load()

            log.debug('Setting done')

            if args.outputFormat in ['cool', 'h5', 'homer', 'ginteractions']:
                matrixFileHandlerOutput = MatrixFileHandler(
                    pFileType=args.outputFormat)
                matrixFileHandlerOutput.set_matrix_variables(
                    _matrix, cut_intervals, nan_bins, correction_factors,
                    distance_counts)
                # the output format is appended as the file extension
                matrixFileHandlerOutput.save(args.outFileName[i] + '.' +
                                             args.outputFormat,
                                             pSymmetric=True,
                                             pApplyCorrection=applyCorrection)
            elif args.outputFormat in ['mcool']:
                log.debug('outformat is mcool')
                if args.resolutions and len(args.matrices) > 1:
                    # only reported; processing continues as before
                    log.error(
                        'Please define either one matrix and many resolutions which should be created.'
                    )
                if args.resolutions:
                    log.info(
                        'Correction factors are removed. They are not valid for any new created resolution'
                    )
                    hic_matrix = HiCMatrix.hiCMatrix()
                    hic_matrix.setMatrix(_matrix, cut_intervals)
                    bin_size = hic_matrix.getBinSize()

                    for resolution in args.resolutions:
                        _mergeFactor = int(resolution) // bin_size
                        merged_matrix = hicMergeMatrixBins.merge_bins(
                            hic_matrix, _mergeFactor)
                        matrixFileHandlerOutput = MatrixFileHandler(
                            pFileType='cool',
                            pEnforceInteger=args.enforce_integer)
                        matrixFileHandlerOutput.set_matrix_variables(
                            merged_matrix.matrix, merged_matrix.cut_intervals,
                            merged_matrix.nan_bins,
                            merged_matrix.correction_factors,
                            merged_matrix.distance_counts)
                        matrixFileHandlerOutput.save(
                            args.outFileName[0] + '.mcool' +
                            '::/resolutions/' + str(resolution),
                            pSymmetric=True,
                            pApplyCorrection=applyCorrection)
                else:
                    # no resolutions given: the matrix is stored under its
                    # own bin size in the first output file
                    hic_matrix = HiCMatrix.hiCMatrix()
                    hic_matrix.setMatrix(_matrix, cut_intervals)
                    bin_size = hic_matrix.getBinSize()
                    matrixFileHandlerOutput = MatrixFileHandler(
                        pFileType='cool')
                    matrixFileHandlerOutput.set_matrix_variables(
                        _matrix, cut_intervals, nan_bins, correction_factors,
                        distance_counts)
                    matrixFileHandlerOutput.save(
                        args.outFileName[0] + '.mcool' + '::/resolutions/' +
                        str(bin_size),
                        pSymmetric=True,
                        pApplyCorrection=applyCorrection)
def test_0_run_with_warnings(self):
    """Run a conversion and require a 'WARNING' on the captured stderr."""
    with captured_output() as (out, err):
        hic2cool_convert(self.infile_name, self.outfile_name, self.binsize,
                         1, True)
    read_err = err.getvalue().strip()
    # assertIn shows the captured stderr when the warning is missing.
    self.assertIn('WARNING', read_err)
def test_convert_multiprocessing(self):
    """Convert with extra arguments ``0, 2`` and check the output file size.

    NOTE(review): going by the test name, the trailing ``2`` is presumably a
    worker count; confirm against ``hic2cool_convert``.
    """
    expected_bytes = 6158552
    hic2cool_convert(self.infile_name, self.outfile_name_all, 0, 2)
    produced_bytes = os.path.getsize(self.outfile_name_all)
    assert produced_bytes == expected_bytes
def test_convert(self):
    """Convert with default options and check the output file size in bytes."""
    expected_bytes = 6158552
    hic2cool_convert(self.infile_name, self.outfile_name_all)
    produced_bytes = os.path.getsize(self.outfile_name_all)
    assert produced_bytes == expected_bytes