def read_matrix(things, parser=None, hic=True):
    """
    Read and check one or several matrices from a file (using
    :func:`pytadbit.parser.hic_parser.autoreader`) or from in-memory data.

    :param things: a single item or a list of items (a list is always
       interpreted as a list of items, so a single list-of-lists matrix has
       to be wrapped in a list). Each item may be:

         * a HiC_data object (appended unchanged),
         * a file handler (closed once parsed),
         * a string: path to a file or, if no such file can be opened and the
           string spans several lines, the content of the file itself,
         * a list of lists, all with the same length as the outer list,
         * a flat tuple of values whose length is a perfect square,
         * an object whose type name contains 'matrix' and that exposes
           ``shape``, ``reshape`` and ``tolist`` (presumably numpy.matrix).

    :param None parser: a parser function called with a file handler (or a
       list of lines) that returns a ``(matrix, size)`` pair, ``matrix`` being
       the flattened data. With this file example.tsv:
       ::

                     chrT_001  chrT_002  chrT_003  chrT_004
           chrT_001       629       164        88       105
           chrT_002        86       612       175       110
           chrT_003       159       216       437       105
           chrT_004       100       111       146       278

       the output of parser('example.tsv') might be:
       ``([629, 86, 159, 100, 164, 612, 216, 111, 88, 175, 437, 146, 105,
       110, 105, 278])``

       Defaults to ``autoreader``.
    :param True hic: if False, TADbit assumes that files contains normalized
       data (the value is stored in the module-level ``HIC_DATA``)

    :returns: a list with one HiC_data object per input item; only cells
       with a non-zero value are stored.
    """
    global HIC_DATA
    HIC_DATA = hic
    parser = parser or autoreader
    if not isinstance(things, list):
        things = [things]
    matrices = []
    for thing in things:
        if isinstance(thing, HiC_data):
            matrices.append(thing)
            continue
        if isinstance(thing, file):
            matrix, size = parser(thing)
            thing.close()
        elif isinstance(thing, str):
            try:
                handle = gzopen(thing)
                try:
                    matrix, size = parser(handle)
                finally:
                    # same contract as for file handlers: the parser result
                    # is complete, the handle is not needed any longer
                    handle.close()
            except IOError:
                # not a readable path: maybe the data itself was passed
                lines = thing.split('\n')
                if len(lines) <= 1:
                    raise IOError('\n ERROR: file %s not found\n' % thing)
                matrix, size = parser(lines)
        elif isinstance(thing, list):
            if not all(len(thing) == len(line) for line in thing):
                raise Exception('must be list of lists, all with same length.')
            # linear-time flattening (row after row)
            matrix = [val for line in thing for val in line]
            size = len(thing)
        elif isinstance(thing, tuple):
            # case we know what we are doing and passing directly a flat
            # tuple of values
            matrix = thing
            siz = sqrt(len(thing))
            if int(siz) != siz:
                raise AttributeError('ERROR: matrix should be square.\n')
            size = int(siz)
        elif 'matrix' in str(type(thing)):
            # errors are propagated: going on after a failure would use
            # unbound (or stale, from the previous item) matrix and size
            nrows, ncols = thing.shape
            if nrows != ncols:
                raise Exception('matrix needs to be square.')
            matrix = thing.reshape(-1).tolist()[0]
            size = nrows
        else:
            raise Exception('Unable to read this file or whatever it is :)')
        # sparse representation: (flat index, value) for non-zero cells only
        matrices.append(HiC_data(
            [(i, matrix[i]) for i in xrange(size ** 2) if matrix[i]], size))
    return matrices
def read_matrix(things, parser=None, hic=True, resolution=1, **kwargs):
    """
    Read and check one or several matrices from a file (using
    :func:`pytadbit.parser.hic_parser.autoreader`), a cooler file or
    in-memory data.

    :param things: a single item or a list of items (a list is always
       interpreted as a list of items, so a single list-of-lists matrix has
       to be wrapped in a list). Each item may be:

         * a HiC_data object (appended unchanged),
         * a file handler (closed once parsed),
         * a string: path to a cooler file, path to a text file or, if no
           such file can be opened and the string spans several lines, the
           content of the file itself,
         * a list of lists, all with the same length as the outer list,
         * a tuple whose length is a perfect square (passed unchanged to
           HiC_data),
         * an object whose type name contains 'matrix' and that exposes
           ``shape``, ``reshape`` and ``tolist`` (presumably numpy.matrix).

    :param None parser: a parser function called with a file handler (or a
       list of lines) that returns five values:
       ``(matrix, size, header, masked, symmetricized)``. With this file
       example.tsv:
       ::

                     chrT_001  chrT_002  chrT_003  chrT_004
           chrT_001       629       164        88       105
           chrT_002        86       612       175       110
           chrT_003       159       216       437       105
           chrT_004       100       111       146       278

       the output of parser('example.tsv') might be:
       ``([629, 86, 159, 100, 164, 612, 216, 111, 88, 175, 437, 146, 105,
       110, 105, 278])``

       When not given, ``abc_reader`` or ``autoreader`` is selected for each
       item according to ``__is_abc``.
    :param 1 resolution: resolution of the matrix. Forwarded to the cooler
       functions only when greater than 1 (None otherwise), and replaced by
       the value returned by ``_header_to_section`` after each parsed file.
    :param True hic: if False, TADbit assumes that files contains normalized
       data (the value is stored in the module-level ``HIC_DATA``)
    :param True one: (keyword only) return only the first HiC_data object
       instead of the whole list

    :returns: the first HiC_data object if ``one`` is True, otherwise the
       list of HiC_data objects (one per input item).
    """
    one = kwargs.get('one', True)
    global HIC_DATA
    HIC_DATA = hic
    if not isinstance(things, list):
        things = [things]
    matrices = []
    for thing in things:
        if isinstance(thing, HiC_data):
            matrices.append(thing)
        elif isinstance(thing, file):
            # the reader is chosen per item: rebinding `parser` here would
            # force the format detected for the first item on all the others
            reader = parser or (abc_reader if __is_abc(thing) else autoreader)
            matrix, size, header, masked, sym = reader(thing)
            print(header)
            thing.close()
            chromosomes, sections, resolution = _header_to_section(
                header, resolution)
            matrices.append(
                HiC_data(matrix, size, dict_sec=sections,
                         chromosomes=chromosomes, resolution=resolution,
                         symmetricized=sym, masked=masked))
        elif isinstance(thing, str):
            cooler_reso = resolution if resolution > 1 else None
            if is_cooler(thing, cooler_reso):
                matrix, size, header, masked, sym = parse_cooler(
                    thing, cooler_reso, not hic)
            else:
                try:
                    reader = parser
                    if not reader:
                        # dedicated handle for format detection, released
                        # right after
                        probe = gzopen(thing)
                        try:
                            reader = (abc_reader if __is_abc(probe)
                                      else autoreader)
                        finally:
                            probe.close()
                    handle = gzopen(thing)
                    try:
                        matrix, size, header, masked, sym = reader(handle)
                    finally:
                        # same contract as for file handlers: the parser
                        # result is complete once it returns
                        handle.close()
                except IOError:
                    # not a readable path: maybe the data itself was passed
                    lines = thing.split('\n')
                    if len(lines) <= 1:
                        raise IOError(
                            '\n ERROR: file %s not found\n' % thing)
                    reader = parser or (abc_reader if __is_abc(lines)
                                        else autoreader)
                    matrix, size, header, masked, sym = reader(lines)
            chromosomes, sections, resolution = _header_to_section(
                header, resolution)
            matrices.append(
                HiC_data(matrix, size, dict_sec=sections,
                         chromosomes=chromosomes, masked=masked,
                         resolution=resolution, symmetricized=sym))
        elif isinstance(thing, list):
            if not all(len(thing) == len(line) for line in thing):
                raise Exception('must be list of lists, all with same length.')
            size = len(thing)
            # sparse representation: (flat index, value), non-zero cells only
            matrix = [(i + j * size, val)
                      for i, line in enumerate(thing)
                      for j, val in enumerate(line) if val]
            matrices.append(HiC_data(matrix, size))
        elif isinstance(thing, tuple):
            # case we know what we are doing and passing directly list of
            # tuples
            matrix = thing
            siz = sqrt(len(thing))
            if int(siz) != siz:
                raise AttributeError('ERROR: matrix should be square.\n')
            size = int(siz)
            matrices.append(HiC_data(matrix, size))
        elif 'matrix' in str(type(thing)):
            # errors are propagated: going on after a failure would use
            # unbound (or stale, from the previous item) matrix and size
            nrows, ncols = thing.shape
            if nrows != ncols:
                raise Exception('matrix needs to be square.')
            matrix = thing.reshape(-1).tolist()[0]
            size = nrows
            matrices.append(HiC_data(matrix, size))
        else:
            raise Exception('Unable to read this file or whatever it is :)')
    if one:
        return matrices[0]
    return matrices
def run(opts):
    """
    Import a Hi-C matrix (TADbit 'matrix' or 'text' format, or cooler) and
    convert it into a BAM file inside a TADbit working directory.

    Steps:

      1. parse ``opts.coord1`` ('chromosome' or 'chromosome:start-end'),
      2. read the chromosome sizes from the header of the input and build
         the list of bins (sections) of the requested region,
      3. load the matrix with the reader matching ``opts.format``,
      4. for cooler input whose weights are not all identical, store them as
         biases in a pickle under ``<workdir>/04_normalization``,
      5. write the BAM file under ``<workdir>/03_filtered_reads`` and save
         the job information in the database.

    :param opts: parsed command line options. Attributes used here:
       ``coord1``, ``format``, ``input``, ``reso``, ``workdir``, ``cpus`` and
       ``samtools``.

    :raises Exception: if the format is not one of 'matrix', 'text' or
       'cooler', if chromosome sizes are missing from the input (or do not
       include the requested chromosome), if the input is not a cooler file
       while the cooler format was requested, or if the size of a 'matrix'
       input differs from the size of the requested region.
    """
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts, extra=['quiet'])

    coord1 = opts.coord1
    if not coord1:
        region1 = None
        start1 = None
        end1 = None
    else:
        try:
            crm1, pos1 = coord1.split(':')
            start1, end1 = pos1.split('-')
            region1 = crm1
            start1 = int(start1)
            end1 = int(end1)
        except ValueError:
            # no (valid) 'start-end' part: the coordinate is a chromosome
            region1 = coord1
            start1 = None
            end1 = None

    # cooler functions receive the resolution only when it is above 1
    cooler_reso = opts.reso if opts.reso > 1 else None

    printime('Importing hic in %s format' % opts.format)
    if opts.format == 'matrix' or opts.format == 'text':
        with gzopen(opts.input) as f_thing:
            masked, chroms_gen, _, _, _, _ = read_file_header(f_thing)
        if not chroms_gen or (region1 and region1 not in chroms_gen):
            raise Exception(
                'ERROR: Chromosome size not included in import file.\n'
                'Please include the chromosome sizes of the data that\n'
                'you want to import in the header of the file. Example:\n'
                '# CRM chr1 249250621')
    elif opts.format == 'cooler':
        if not is_cooler(opts.input, cooler_reso):
            raise Exception('ERROR: The input file is not a cooler')
        chroms_gen = parse_header(opts.input, cooler_reso)
        if not chroms_gen or (region1 and region1 not in chroms_gen):
            raise Exception(
                'ERROR: Chromosome size not included in import file.\n')
    else:
        # without this guard the function stops a few lines below with a
        # NameError on chroms_gen
        raise Exception('ERROR: unsupported input format: %s' % opts.format)

    # number of bins per chromosome at the working resolution
    chroms = OrderedDict(
        (crm, int(chroms_gen[crm] // opts.reso) + 1) for crm in chroms_gen)

    sections = []
    if not region1:
        size = 0
        for crm in chroms:
            size += chroms[crm]
            sections.extend((crm, i) for i in range(chroms[crm]))
    elif not start1:
        # NOTE(review): a start coordinate of 0 also ends up here (whole
        # chromosome) -- confirm this is intended
        size = chroms[region1]
        sections.extend((region1, i) for i in range(size))
    else:
        # NOTE(review): size is the whole chromosome although sections only
        # cover start1-end1 -- confirm against HiC_data / create_BAMhic
        size = chroms[region1]
        sections.extend(
            (region1, i)
            for i in range(start1 // opts.reso, end1 // opts.reso))
    dict_sec = {sec: pos for pos, sec in enumerate(sections)}

    bias_file = None
    badcol = {}
    if opts.format == 'text':
        with gzopen(opts.input) as f_thing:
            matrix = abc_reader(f_thing, size,
                                start1 // opts.reso if start1 else None)
        size_mat = size
    elif opts.format == 'matrix':
        with gzopen(opts.input) as in_f:
            matrix, size_mat, _, masked, _ = autoreader(in_f)
        if size != size_mat:
            raise Exception('ERROR: The size of the specified region is '
                            'different from the data in the matrix')
    elif opts.format == 'cooler':
        # the size reported by the cooler file replaces the one computed
        # from the sections
        matrix, weights, size, _ = parse_cooler(
            opts.input, cooler_reso, normalized=True, raw_values=True)
        masked = {}
        size_mat = size
        if len(set(weights)) > 1:
            printime('Transforming cooler weights to biases')
            outdir_norm = path.join(opts.workdir, '04_normalization')
            mkdir(outdir_norm)
            bias_file = path.join(
                outdir_norm, 'biases_%s_%s.pickle' %
                (nicer(opts.reso).replace(' ', ''), param_hash))
            # columns with a null weight are flagged as bad columns
            badcol.update((i, True) for i, w in enumerate(weights) if w == 0)
            biases = {k: (b if b > 0 else float('nan'))
                      for k, b in enumerate(weights)}
            # `with` releases the file even if dump fails
            with open(bias_file, 'wb') as out:
                dump({'biases': biases,
                      'decay': {},
                      'badcol': badcol,
                      'resolution': opts.reso},
                     out, HIGHEST_PROTOCOL)

    hic = HiC_data(matrix, size_mat, dict_sec=dict_sec, chromosomes=chroms,
                   masked=masked, resolution=opts.reso)

    printime('Creating BAM file')
    outbam = path.join(opts.workdir, '03_filtered_reads',
                       'intersection_%s' % param_hash)

    total_counts = create_BAMhic(hic, opts.cpus, outbam, chroms_gen,
                                 opts.reso, samtools=opts.samtools)

    finish_time = time.localtime()
    # save all job information to sqlite DB
    save_to_db(opts, total_counts, size_mat, bias_file, len(badcol),
               outbam + '.bam', launch_time, finish_time)
def read_matrix(things, parser=None):
    """
    Read and check one or several matrices from a file (using
    :func:`pytadbit.parser.hic_parser.autoreader`) or from in-memory data.

    :param things: a single item or a list of items (a list is always
       interpreted as a list of items, so a single list-of-lists matrix has
       to be wrapped in a list). Each item may be:

         * a flat tuple of values whose length is a perfect square,
         * a list of lists, all with the same length as the outer list,
         * a string: path to a file or, if no such file can be opened and the
           string spans several lines, the content of the file itself,
         * a file handler (closed once parsed),
         * an object whose type name contains 'matrix' and that exposes
           ``shape``, ``reshape`` and ``tolist`` (presumably numpy.matrix);
           if it cannot be converted the error is printed and the item is
           skipped.

    :param None parser: a parser function called with a file handler (or a
       list of lines) that returns a ``(matrix, size)`` pair, ``matrix`` being
       the flattened data. With this file example.tsv:
       ::

                     chrT_001  chrT_002  chrT_003  chrT_004
           chrT_001       629       164        88       105
           chrT_002        86       612       175       110
           chrT_003       159       216       437       105
           chrT_004       100       111       146       278

       the output of parser('example.tsv') might be:
       ``([629, 86, 159, 100, 164, 612, 216, 111, 88, 175, 437, 146, 105,
       110, 105, 278])``

       Defaults to ``autoreader``.

    :returns: a tuple with the list of flattened matrices (one per item
       read) and their common number of rows.

    :raises Exception: if no matrix could be read, or if the matrices do not
       all have the same size.
    """
    parser = parser or autoreader
    if not isinstance(things, list):
        things = [things]
    matrices = []
    sizes = []
    # in-memory containers are checked first, then paths and handlers; the
    # types are disjoint, the order does not change which branch is taken
    for thing in things:
        if isinstance(thing, tuple):
            # case we know what we are doing and passing directly a flat
            # tuple of values
            siz = sqrt(len(thing))
            if int(siz) != siz:
                raise AttributeError('ERROR: matrix should be square.\n')
            matrices.append(thing)
            sizes.append(int(siz))
        elif isinstance(thing, list):
            if not all(len(thing) == len(line) for line in thing):
                raise Exception('must be list of lists, all with same length.')
            # linear-time flattening (row after row)
            matrices.append([val for line in thing for val in line])
            sizes.append(len(thing))
        elif isinstance(thing, str):
            try:
                handle = gzopen(thing)
                try:
                    matrix, size = parser(handle)
                finally:
                    # same contract as for file handlers: the parser result
                    # is complete, the handle is not needed any longer
                    handle.close()
            except IOError:
                # not a readable path: maybe the data itself was passed
                lines = thing.split('\n')
                if len(lines) <= 1:
                    raise Exception('\n ERROR: file %s not found\n' % thing)
                matrix, size = parser(lines)
            matrices.append(matrix)
            sizes.append(size)
        elif isinstance(thing, file):
            matrix, size = parser(thing)
            thing.close()
            matrices.append(matrix)
            sizes.append(size)
        elif 'matrix' in str(type(thing)):
            # best effort: a matrix that cannot be converted is reported and
            # skipped, nothing is appended for it
            try:
                nrows, ncols = thing.shape
                if nrows != ncols:
                    raise Exception('matrix needs to be square.')
                matrices.append(thing.reshape(-1).tolist()[0])
                sizes.append(nrows)
            except Exception as exc:
                print('Error found: %s' % exc)
        else:
            raise Exception('Unable to read this file or whatever it is :)')
    if not sizes:
        # explicit message instead of an IndexError on sizes[0]
        raise Exception('No matrix could be read.')
    if all(size == sizes[0] for size in sizes):
        return matrices, sizes[0]
    raise Exception('All matrices must have the same size ' +
                    '(same chromosome and same bins).')
def read_matrix(things, parser=None, hic=True, resolution=1, **kwargs):
    """
    Read and check one or several matrices from a file (using
    :func:`pytadbit.parser.hic_parser.autoreader`) or from in-memory data.

    :param things: a single item or a list of items (a list is always
       interpreted as a list of items, so a single list-of-lists matrix has
       to be wrapped in a list). Each item may be:

         * a HiC_data object (appended unchanged),
         * a file handler (closed once parsed),
         * a string: path to a file or, if no such file can be opened and the
           string spans several lines, the content of the file itself,
         * a list of lists, all with the same length as the outer list,
         * a tuple whose length is a perfect square (passed unchanged to
           HiC_data),
         * an object whose type name contains 'matrix' and that exposes
           ``shape``, ``reshape`` and ``tolist`` (presumably numpy.matrix).

    :param None parser: a parser function called with a file handler (or a
       list of lines) that returns five values:
       ``(matrix, size, header, masked, symmetricized)``. With this file
       example.tsv:
       ::

                     chrT_001  chrT_002  chrT_003  chrT_004
           chrT_001       629       164        88       105
           chrT_002        86       612       175       110
           chrT_003       159       216       437       105
           chrT_004       100       111       146       278

       the output of parser('example.tsv') might be:
       ``([629, 86, 159, 100, 164, 612, 216, 111, 88, 175, 437, 146, 105,
       110, 105, 278])``

       When not given, ``abc_reader`` or ``autoreader`` is selected for each
       item according to ``__is_abc``.
    :param 1 resolution: resolution of the matrix; replaced by the value
       returned by ``_header_to_section`` after each parsed file.
    :param True hic: if False, TADbit assumes that files contains normalized
       data (the value is stored in the module-level ``HIC_DATA``)
    :param True one: (keyword only) return only the first HiC_data object
       instead of the whole list

    :returns: the first HiC_data object if ``one`` is True, otherwise the
       list of HiC_data objects (one per input item).
    """
    one = kwargs.get('one', True)
    global HIC_DATA
    HIC_DATA = hic
    if not isinstance(things, list):
        things = [things]
    matrices = []
    for thing in things:
        if isinstance(thing, HiC_data):
            matrices.append(thing)
        elif isinstance(thing, file):
            # the reader is chosen per item: rebinding `parser` here would
            # force the format detected for the first item on all the others
            reader = parser or (abc_reader if __is_abc(thing) else autoreader)
            matrix, size, header, masked, sym = reader(thing)
            print(header)
            thing.close()
            chromosomes, sections, resolution = _header_to_section(
                header, resolution)
            matrices.append(
                HiC_data(matrix, size, dict_sec=sections,
                         chromosomes=chromosomes, resolution=resolution,
                         symmetricized=sym, masked=masked))
        elif isinstance(thing, str):
            try:
                reader = parser
                if not reader:
                    # dedicated handle for format detection, released right
                    # after
                    probe = gzopen(thing)
                    try:
                        reader = abc_reader if __is_abc(probe) else autoreader
                    finally:
                        probe.close()
                handle = gzopen(thing)
                try:
                    matrix, size, header, masked, sym = reader(handle)
                finally:
                    # same contract as for file handlers: the parser result
                    # is complete once it returns
                    handle.close()
            except IOError:
                # not a readable path: maybe the data itself was passed
                lines = thing.split('\n')
                if len(lines) <= 1:
                    raise IOError('\n ERROR: file %s not found\n' % thing)
                reader = parser or (abc_reader if __is_abc(lines)
                                    else autoreader)
                matrix, size, header, masked, sym = reader(lines)
            chromosomes, sections, resolution = _header_to_section(
                header, resolution)
            matrices.append(
                HiC_data(matrix, size, dict_sec=sections,
                         chromosomes=chromosomes, masked=masked,
                         resolution=resolution, symmetricized=sym))
        elif isinstance(thing, list):
            if not all(len(thing) == len(line) for line in thing):
                raise Exception('must be list of lists, all with same length.')
            size = len(thing)
            # sparse representation: (flat index, value), non-zero cells only
            matrix = [(i + j * size, val)
                      for i, line in enumerate(thing)
                      for j, val in enumerate(line) if val]
            matrices.append(HiC_data(matrix, size))
        elif isinstance(thing, tuple):
            # case we know what we are doing and passing directly list of
            # tuples
            matrix = thing
            siz = sqrt(len(thing))
            if int(siz) != siz:
                raise AttributeError('ERROR: matrix should be square.\n')
            size = int(siz)
            matrices.append(HiC_data(matrix, size))
        elif 'matrix' in str(type(thing)):
            # errors are propagated: going on after a failure would use
            # unbound (or stale, from the previous item) matrix and size
            nrows, ncols = thing.shape
            if nrows != ncols:
                raise Exception('matrix needs to be square.')
            matrix = thing.reshape(-1).tolist()[0]
            size = nrows
            matrices.append(HiC_data(matrix, size))
        else:
            raise Exception('Unable to read this file or whatever it is :)')
    if one:
        return matrices[0]
    return matrices