示例#1
0
def test_save_cool_apply_division():
    """Round-trip a cool matrix through save/load with '/' as correction operator.

    The reference file is loaded, written to a temporary cool file with
    ``pApplyCorrection=True`` and read back; the re-read data is then compared
    against a freshly loaded copy of the reference.
    """
    cool_outfile = outfile + '.cool'
    reference_path = ROOT + 'Li_et_al_2015.cool'

    # read the reference matrix with the division operator
    source = MatrixFileHandler(pFileType='cool',
                               pMatrixFile=reference_path,
                               pCorrectionOperator='/')
    assert source is not None
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = source.load()

    # hand the loaded data to a fresh handler and write it to disk
    writer = MatrixFileHandler(pFileType='cool', pCorrectionOperator='/')
    writer.set_matrix_variables(matrix, cut_intervals, nan_bins,
                                correction_factors, distance_counts)
    writer.save(pName=cool_outfile, pSymmetric=False, pApplyCorrection=True)

    # read the freshly written file back
    reader = MatrixFileHandler(pFileType='cool',
                               pMatrixFile=cool_outfile,
                               pCorrectionOperator='/')
    assert reader is not None
    matrix_test, cut_intervals_test, nan_bins_test, distance_counts_test, correction_factors_test = reader.load()

    # load the reference a second time so the comparison uses data that was
    # not passed through set_matrix_variables()/save()
    reference_path = ROOT + 'Li_et_al_2015.cool'
    source = MatrixFileHandler(pFileType='cool',
                               pMatrixFile=reference_path,
                               pCorrectionOperator='/')
    assert source is not None
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = source.load()

    nt.assert_almost_equal(matrix.data, matrix_test.data, decimal=1)
    nt.assert_equal(len(cut_intervals), len(cut_intervals_test))
    for expected, observed in ((nan_bins, nan_bins_test),
                               (distance_counts, distance_counts_test)):
        nt.assert_equal(expected, observed)

    os.unlink(cool_outfile)
def test_load_cool_matrix_only():
    """Check that ``pLoadMatrixOnly=True`` yields only the raw matrix parts.

    In matrix-only mode the first returned element is expected to hold four
    entries (row indices, column indices, values, dimension) and every other
    returned element is expected to be ``None``.  The raw parts are compared
    with a regular load of the same file.
    """
    cool_path = ROOT + 'GSE63525_GM12878_insitu_primary_2_5mb_hic2cool051.cool'

    # matrix-only load
    raw_handler = MatrixFileHandler(pFileType='cool',
                                    pMatrixFile=cool_path,
                                    pLoadMatrixOnly=True)
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = raw_handler.load()

    assert len(matrix) == 4
    assert cut_intervals is None
    assert nan_bins is None
    assert distance_counts is None
    assert correction_factors is None

    # regular load of the same file as reference
    full_handler = MatrixFileHandler(pFileType='cool', pMatrixFile=cool_path)
    matrix2, cut_intervals2, nan_bins2, distance_counts2, correction_factors2 = full_handler.load()

    rows, cols = matrix2.nonzero()
    for raw_part, reference in zip(matrix[:3], (rows, cols, matrix2.data)):
        nt.assert_almost_equal(raw_part, reference, decimal=1)
    assert matrix[3] == matrix2.shape[0]
def test_load_distance_cool():
    """Check that ``pDistance`` limits the loaded interactions of a cool file.

    Chromosome '1' is loaded with ``pDistance=2500000``, saved and re-loaded.
    With the distance limit no non-zero entry may be more than one bin away
    from the main diagonal, while an unrestricted load must contain such
    entries.  Finally the saved copy is compared with the loaded data.
    """
    cool_outfile = outfile + '.cool'
    pMatrixFile = ROOT + 'GSE63525_GM12878_insitu_primary_2_5mb_hic2cool051.cool'

    # distance-restricted load of chromosome 1
    fh = MatrixFileHandler(pFileType='cool',
                           pMatrixFile=pMatrixFile,
                           pChrnameList=['1'],
                           pDistance=2500000)
    assert fh is not None
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = fh.load()

    # write the restricted data with the same handler
    fh.set_matrix_variables(matrix, cut_intervals, nan_bins,
                            correction_factors, distance_counts)
    fh.save(pName=cool_outfile, pSymmetric=True, pApplyCorrection=True)

    # read the written file back
    fh_test = MatrixFileHandler(pFileType='cool', pMatrixFile=cool_outfile)
    assert fh_test is not None
    matrix_test, cut_intervals_test, nan_bins_test, distance_counts_test, correction_factors_test = fh_test.load()

    # 2.5 Mb resolution with a 2.5 Mb limit: nothing beyond one bin offset
    rows, cols = matrix.nonzero()
    beyond_limit = np.absolute(rows - cols) > 1
    assert np.sum(beyond_limit) == 0

    # the unrestricted load must contain entries beyond one bin offset
    fh = MatrixFileHandler(pFileType='cool',
                           pChrnameList=['1'],
                           pMatrixFile=pMatrixFile)
    assert fh is not None
    matrix2, _, _, _, _ = fh.load()
    rows, cols = matrix2.nonzero()
    beyond_limit = np.absolute(rows - cols) > 1
    assert np.sum(beyond_limit) > 0

    # loaded and saved/re-loaded data must be equal
    comparisons = (
        (matrix.data, matrix_test.data),
        (cut_intervals, cut_intervals_test),
        (nan_bins, nan_bins_test),
        (distance_counts, distance_counts_test),
        (correction_factors, correction_factors_test),
    )
    for expected, observed in comparisons:
        nt.assert_equal(expected, observed)

    os.unlink(cool_outfile)
def compute_consensus_matrix(pMatrixName, pClusterMatricesList, pClusterName,
                             pQueue):
    """Sum all matrices of one cluster and put the result handler on a queue.

    The first matrix of ``pClusterMatricesList`` is loaded completely and
    provides the bin metadata; every further matrix is loaded in matrix-only
    mode, rebuilt as a CSR matrix and added to the running sum.

    Parameters
    ----------
    pMatrixName : str
        Path of the container file; each entry of ``pClusterMatricesList`` is
        appended to it after a ``'::'`` separator.
    pClusterMatricesList : list of str
        Names of the matrices that belong to the cluster.
    pClusterName
        Cluster identifier, used for the name of the output matrix.
    pQueue
        Queue-like object with a ``put`` method that receives the result.

    Notes
    -----
    Exactly one item is always put on ``pQueue``: a ``MatrixFileHandler``
    holding the consensus matrix, or ``None`` if anything failed.  Without the
    ``None`` fallback a failure would leave the output name unbound and the
    final ``put`` would raise, so nothing would ever reach the queue.
    """
    # Bound up front so the final put() is valid even if the try block fails.
    matrixFileHandlerOutput = None
    try:
        matrixFileHandlerInput = MatrixFileHandler(
            pFileType='cool',
            pMatrixFile=pMatrixName + '::' + pClusterMatricesList[0])
        consensus_matrix, cut_intervals, nan_bins, \
            distance_counts, correction_factors = matrixFileHandlerInput.load()

        for matrix in pClusterMatricesList[1:]:
            matrixFileHandlerInput = MatrixFileHandler(
                pFileType='cool',
                pMatrixFile=pMatrixName + '::' + matrix,
                pLoadMatrixOnly=True)
            _matrix, _, _, _, _ = matrixFileHandlerInput.load()

            # Matrix-only loads return (rows, cols, data, dimension).
            # The builtin float replaces the np.float alias, which was
            # removed from NumPy and referred to the same type.
            _matrix = csr_matrix((_matrix[2], (_matrix[0], _matrix[1])),
                                 (_matrix[3], _matrix[3]),
                                 dtype=float)

            if consensus_matrix is None:
                consensus_matrix = _matrix
            else:
                consensus_matrix += _matrix

        # The hic2cool version is taken from the handler that was used last.
        hic2CoolVersion = matrixFileHandlerInput.matrixFile.hic2cool_version
        output_handler = MatrixFileHandler(
            pFileType='cool',
            pMatrixFile='consensus_matrix_cluster_' + str(pClusterName) + ':' +
            str(len(pClusterMatricesList)),
            pEnforceInteger=False,
            pFileWasH5=False,
            pHic2CoolVersion=hic2CoolVersion)

        output_handler.set_matrix_variables(consensus_matrix,
                                            cut_intervals, nan_bins,
                                            correction_factors,
                                            distance_counts)
        # Publish only a fully initialised handler.
        matrixFileHandlerOutput = output_handler
    except Exception as exp:
        # Best effort: report the failure visibly and still answer the parent.
        log.error('exception! {}'.format(str(exp)))
    log.debug('computaiton of {} done'.format(str(pClusterName)))
    pQueue.put(matrixFileHandlerOutput)
示例#5
0
def test_load_hicpro(capsys):
    """Load a HiC-Pro matrix with its bed file and check the returned parts.

    The first matrix row, the first 20 cut intervals and the ``None`` values
    of nan bins, correction factors and distance counts are verified.
    """
    # create matrixFileHandler instance with filetype 'hicpro'
    matrix_path = ROOT + 'test_matrix.hicpro'
    bed_path = ROOT + 'test_matrix.bed'
    handler = MatrixFileHandler(pFileType='hicpro',
                                pMatrixFile=matrix_path,
                                pBedFileHicPro=bed_path)
    assert handler is not None

    # load data
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = handler.load()

    # expected first row: all zeros except three known entries
    test_matrix = np.zeros((1, 3114))
    test_matrix[0, 0] = 41.345793
    test_matrix[0, 827] = 5.42079
    test_matrix[0, 1263] = 5.122642

    # check shape and values of the first row
    first_row = matrix[0].todense()
    assert first_row.shape == test_matrix.shape
    nt.assert_almost_equal(matrix[0].todense(), test_matrix)

    # expected first 20 bins: consecutive 1 Mb bins on chr1, numbered from 1
    megabase = 1000000
    test_cut_intervals = np.array([
        ('chr1', index * megabase, (index + 1) * megabase, index + 1)
        for index in range(20)
    ])
    nt.assert_equal(cut_intervals[0:20], test_cut_intervals)

    assert nan_bins is None
    assert correction_factors is None
    assert distance_counts is None
示例#6
0
def compute_read_coverage_sparsity(pMatrixName, pMatricesList, pXDimension, pMaximumRegionToConsider, pQueue):
    """Compute read coverage and sparsity for every matrix in ``pMatricesList``.

    Bin size and matrix dimension are taken from the first matrix of the list.
    Each matrix is then loaded in matrix-only mode.  Sparsity is the number of
    stored entries within ``pMaximumRegionToConsider`` of the diagonal divided
    by ``dimension * max_distance``; read coverage is twice the sum of the
    stored values minus the main-diagonal values.

    The result ``[read_coverage, sparsity]`` is put on ``pQueue``.
    ``pXDimension`` is not used by this function.
    """
    log.debug('read covarage and sparsity')

    reference = hm.hiCMatrix(pMatrixFile=pMatrixName + '::' + pMatricesList[0])
    bin_size = reference.getBinSize()
    shape_x = reference.matrix.shape[0]

    read_coverage = []
    sparsity = []
    for cell_name in pMatricesList:
        handler = MatrixFileHandler(pFileType='cool',
                                    pMatrixFile=pMatrixName + '::' + cell_name,
                                    pLoadMatrixOnly=True)
        _matrix, _, _, _, _ = handler.load()

        # genomic distance limit expressed in bins
        max_distance = pMaximumRegionToConsider // bin_size

        # matrix-only loads are indexed as rows, columns, values
        rows, cols, values = _matrix[0], _matrix[1], _matrix[2]
        distances = np.absolute(rows - cols)

        within_range = distances <= max_distance
        sparsity.append(len(values[within_range]) / (shape_x * max_distance))

        # only upper half is loaded --> times 2, minus the doubled main diagonal
        on_diagonal = distances == 0
        read_coverage.append(values.sum() * 2 - values[on_diagonal].sum())

    pQueue.put([read_coverage, sparsity])
示例#7
0
def test_hicConvertFormat_h5_to_homer():
    """Convert a cool matrix to homer format and compare it with the source.

    hicConvertFormat is run with ``--inputFormat cool --outputFormat homer``
    (despite the 'h5' in the test name).  The output is read through gzip,
    written uncompressed to a second temporary file and loaded with the
    'homer' handler; its values must match the original matrix after rounding
    to whole numbers.
    """
    outfile = NamedTemporaryFile(suffix='.homer', delete=False)
    outfile.close()

    args = "--matrices {} --outFileName {} --inputFormat cool --outputFormat homer ".format(
        original_matrix_cool_chr4, outfile.name).split()
    # hicConvertFormat.main(args)
    compute(hicConvertFormat.main, args, 5)

    test = hm.hiCMatrix(original_matrix_cool_chr4)

    # Read the converter output through gzip; the context manager guarantees
    # the handle is closed even if reading fails.
    with gzip.open(outfile.name, 'rb') as compressed:
        file_content = compressed.read()

    # Store the decompressed content in a second file for the homer loader.
    outfile2 = NamedTemporaryFile(suffix='.homer', delete=False)
    outfile2.close()
    with open(outfile2.name, 'wb') as matrix_file:
        matrix_file.write(file_content)

    matrixFileHandlerInput = MatrixFileHandler(pFileType='homer',
                                               pMatrixFile=outfile2.name)

    _matrix, cut_intervals, nan_bins, \
        distance_counts, correction_factors = matrixFileHandlerInput.load()

    nt.assert_array_almost_equal(test.matrix.data, _matrix.data, decimal=0)
示例#8
0
def compute_consensus_matrix(pMatrixName, pClusterMatricesList, pAppend,
                             pQueue):
    """Build one consensus (summed) matrix handler per cluster.

    For every cluster in ``pClusterMatricesList`` all its matrices are loaded
    from ``pMatrixName`` and added up.  The bin metadata and the hic2cool
    version stored with the result come from the matrix loaded last.
    The list of output handlers is put on ``pQueue``.

    The output handler of the very first cluster gets ``pAppend=False`` when
    this function's ``pAppend`` argument is truthy; every other handler gets
    ``pAppend=True``.

    NOTE(review): assumes every cluster contains at least one matrix.
    """
    output_handlers = []
    for i, cluster in enumerate(pClusterMatricesList):
        append = not (i == 0 and pAppend)

        consensus_matrix = None
        for cell_name in cluster:
            matrixFileHandlerInput = MatrixFileHandler(
                pFileType='cool', pMatrixFile=pMatrixName + '::' + cell_name)
            loaded = matrixFileHandlerInput.load()
            _matrix, cut_intervals, nan_bins, distance_counts, correction_factors = loaded

            if consensus_matrix is not None:
                consensus_matrix += _matrix
            else:
                consensus_matrix = _matrix

        hic2CoolVersion = matrixFileHandlerInput.matrixFile.hic2cool_version
        handler = MatrixFileHandler(pFileType='cool',
                                    pAppend=append,
                                    pEnforceInteger=False,
                                    pFileWasH5=False,
                                    pHic2CoolVersion=hic2CoolVersion)
        handler.set_matrix_variables(consensus_matrix, cut_intervals,
                                     nan_bins, correction_factors,
                                     distance_counts)
        output_handlers.append(handler)

    pQueue.put(output_handlers)
示例#9
0
def test_load_cool(capsys):
    """Load a cool file and verify matrix, bins, nan bins and correction factors."""
    # create matrixFileHandler instance with filetype 'cool'
    cool_path = ROOT + 'Li_et_al_2015.cool'
    handler = MatrixFileHandler(pFileType='cool', pMatrixFile=cool_path)
    assert handler is not None

    # load data
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = handler.load()

    # the first matrix row is expected to be all zeros
    nt.assert_almost_equal(matrix[0].todense(), np.zeros((1, 11104)))

    # the first 20 bins are contiguous on chromosome X; list their boundaries
    boundaries = [0, 2200, 4702, 7060, 8811, 11048, 14329, 16847, 19537,
                  20701, 22321, 24083, 25983, 27619, 29733, 30973, 32214,
                  34179, 35987, 37598, 39009]
    test_cut_intervals = [('X', start, end, 1.0)
                          for start, end in zip(boundaries, boundaries[1:])]
    for observed, expected in zip(cut_intervals[0:20], test_cut_intervals):
        for position, element in enumerate(observed):
            assert element == expected[position]

    test_nan_bins = list(range(8)) + [30, 31]
    nt.assert_almost_equal(nan_bins[0:10], test_nan_bins)

    test_correction_factors = [0.] * 8 + [1.1022922, 0.796711]
    nt.assert_almost_equal(correction_factors[0:10], test_correction_factors)

    assert distance_counts is None
示例#10
0
def main(args=None):
    """Merge several cool files into one scool file using worker processes.

    The cut intervals of the first input matrix are loaded once and passed to
    every worker.  ``args.matrices`` is split into ``args.threads`` chunks of
    equal size (the last worker also takes the remainder), each chunk is
    loaded by ``load_cool_files`` in its own process, and the returned
    handlers are written to ``args.outFileName`` via an 'scool' handler.
    """
    args = parse_arguments().parse_args(args)
    log.debug(args)

    # cut intervals of the first matrix are shared with all workers
    first_handler = MatrixFileHandler(pFileType='cool',
                                      pMatrixFile=args.matrices[0])
    _, cut_intervals_all, _, _, _ = first_handler.load()

    matrices_list = args.matrices
    threads = args.threads
    matricesPerThread = len(matrices_list) // threads

    matrixFileHandler_list = [None] * threads
    process = [None] * threads
    queue = [None] * threads
    thread_done = [False] * threads

    # start one worker per chunk of input files
    for i in range(threads):
        chunk_start = i * matricesPerThread
        # the last worker takes everything that is left
        chunk_end = (i + 1) * matricesPerThread if i < threads - 1 else None
        worker_kwargs = dict(pMatricesList=matrices_list[chunk_start:chunk_end],
                             pCutIntervals=cut_intervals_all,
                             pQueue=queue_factory()) if False else None
        queue[i] = Queue()
        worker_kwargs = dict(pMatricesList=matrices_list[chunk_start:chunk_end],
                             pCutIntervals=cut_intervals_all,
                             pQueue=queue[i])
        process[i] = Process(target=load_cool_files, kwargs=worker_kwargs)
        process[i].start()

    # poll the queues once per second until every worker has delivered
    all_data_collected = False
    while not all_data_collected:
        for i in range(threads):
            if queue[i] is None or queue[i].empty():
                continue
            matrixFileHandler_list[i] = queue[i].get()
            queue[i] = None
            process[i].join()
            process[i].terminate()
            process[i] = None
            thread_done[i] = True
        all_data_collected = all(thread_done)
        time.sleep(1)

    # flatten the per-worker lists into one list of handlers
    matrix_file_handler_object_list = []
    for sublist in matrixFileHandler_list:
        for item in sublist:
            matrix_file_handler_object_list.append(item)

    matrixFileHandler = MatrixFileHandler(pFileType='scool')
    matrixFileHandler.matrixFile.coolObjectsList = matrix_file_handler_object_list
    matrixFileHandler.save(args.outFileName,
                           pSymmetric=True,
                           pApplyCorrection=False)
def test_load_cool_hic2cool_versions():
    """Compare the same data set converted by hic2cool 0.4.2 and 0.5.1.

    Both files are loaded with the 'KR' correction table; the 0.4.2 file is
    additionally given ``pCorrectionOperator='*'``.  The loaded values must
    agree after rounding, and bins, nan bins and distance counts must match.
    """
    prefix = ROOT + 'GSE63525_GM12878_insitu_primary_2_5mb_hic2cool'

    hic2cool_042 = MatrixFileHandler(pFileType='cool',
                                     pMatrixFile=prefix + '042.cool',
                                     pCorrectionFactorTable='KR',
                                     pCorrectionOperator='*')
    hic2cool_051 = MatrixFileHandler(pFileType='cool',
                                     pMatrixFile=prefix + '051.cool',
                                     pCorrectionFactorTable='KR')

    loaded_042 = hic2cool_042.load()
    loaded_051 = hic2cool_051.load()
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = loaded_042
    matrix_test, cut_intervals_test, nan_bins_test, distance_counts_test, correction_factors_test = loaded_051

    nt.assert_almost_equal(matrix.data, matrix_test.data, decimal=0)
    nt.assert_equal(len(cut_intervals), len(cut_intervals_test))
    nt.assert_equal(nan_bins, nan_bins_test)
    nt.assert_equal(distance_counts, distance_counts_test)
示例#12
0
def load_cool_files(pMatrixName, pMatricesList, pCutIntervals, pQueue):
    """Load matrices from a container file and wrap each in an output handler.

    Every entry of ``pMatricesList`` is loaded from ``pMatrixName`` with
    ``pNoCutIntervals=True`` and stored, together with the shared
    ``pCutIntervals``, in a new 'cool' handler named after the last path
    component of the entry.

    On success the list of handlers is put on ``pQueue``.  If anything fails,
    a string starting with ``'Fail: '`` that contains the error text and the
    traceback is put on the queue instead.
    """
    handlers = []
    try:
        for cell_path in pMatricesList:
            reader = MatrixFileHandler(pFileType='cool',
                                       pMatrixFile=pMatrixName + "::" + cell_path,
                                       pNoCutIntervals=True)
            _matrix, _, nan_bins, distance_counts, correction_factors = reader.load()

            # name the output after the last path component of the entry
            cell_name = cell_path.split('/')[-1]
            writer = MatrixFileHandler(pFileType='cool', pMatrixFile=cell_name)
            writer.set_matrix_variables(_matrix, pCutIntervals, nan_bins,
                                        correction_factors, distance_counts)
            handlers.append(writer)
    except Exception as exp:
        pQueue.put('Fail: ' + str(exp) + traceback.format_exc())
    else:
        pQueue.put(handlers)
示例#13
0
def test_load_h5(capsys):
    """Load an h5 matrix and verify matrix, bins, nan bins and correction factors."""
    # create matrixFileHandler instance with filetype 'h5'
    h5_path = ROOT + 'Li_et_al_2015.h5'
    handler = MatrixFileHandler(pFileType='h5', pMatrixFile=h5_path)
    assert handler is not None

    # load data
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = handler.load()

    # the first matrix row is expected to be all zeros
    nt.assert_almost_equal(matrix[0].todense(), np.zeros((1, 11104)))

    # the first four bins
    expected_bins = [
        ('X', 0, 2200, 0.0),
        ('X', 2200, 4702, 0.0),
        ('X', 4702, 7060, 0.0),
        ('X', 7060, 8811, 0.4),
    ]
    for position, expected in enumerate(expected_bins):
        nt.assert_equal(cut_intervals[position], expected)

    test_nan_bins = np.array([0, 1, 2, 3, 4, 5, 6, 7, 30, 31,
                              32, 51, 52, 53, 54, 81, 82, 83, 84, 94])
    nt.assert_equal(nan_bins[0:20], test_nan_bins)

    assert distance_counts is None

    test_correction_factors = np.array([0] * 8 + [0.90720049, 1.25516028])
    nt.assert_almost_equal(correction_factors[0:10], test_correction_factors)
示例#14
0
def load_cool_files(pMatricesList, pCutIntervals, pQueue):
    """Load every cool file of ``pMatricesList`` into its own output handler.

    Each file is loaded with ``pNoCutIntervals=True`` and its data is stored,
    together with the shared ``pCutIntervals``, in a new 'cool' handler that
    carries the same file name.  A file that raises during this step is
    logged with a warning and left out.  The list of handlers is put on
    ``pQueue``.
    """
    handlers = []
    for cool_path in pMatricesList:
        try:
            reader = MatrixFileHandler(pFileType='cool',
                                       pMatrixFile=cool_path,
                                       pNoCutIntervals=True)
            _matrix, _, nan_bins, distance_counts, correction_factors = reader.load()

            writer = MatrixFileHandler(pFileType='cool', pMatrixFile=cool_path)
            writer.set_matrix_variables(_matrix, pCutIntervals, nan_bins,
                                        correction_factors, distance_counts)
            handlers.append(writer)
        except Exception as exp:
            message = 'File could not be opend and is excluded: {}. Error message: {} '
            log.warning(message.format(cool_path, str(exp)))

    pQueue.put(handlers)
def test_save_cool_enforce_integer():
    """Save h5 data as cool with ``pEnforceInteger=True`` and check the result.

    The h5 matrix is written through a 'cool' handler flagged as coming from
    h5, then read back with ``pApplyCorrectionCoolerLoad=False``.  The stored
    values must match the original values rounded to integers with zeros
    removed.
    """
    cool_outfile = outfile + '.cool'

    # create matrixFileHandler instance with filetype 'h5' and load its data
    h5_path = ROOT + 'Li_et_al_2015.h5'
    source = MatrixFileHandler(pFileType='h5', pMatrixFile=h5_path)
    assert source is not None
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = source.load()

    # hand the data to an integer-enforcing cool handler and save it
    writer = MatrixFileHandler(pFileType='cool', pEnforceInteger=True)
    writer.set_matrix_variables(matrix, cut_intervals, nan_bins,
                                correction_factors, distance_counts)
    writer.matrixFile.fileWasH5 = True
    writer.save(pName=cool_outfile, pSymmetric=False, pApplyCorrection=True)

    # read the written file back
    reader = MatrixFileHandler(pFileType='cool',
                               pMatrixFile=cool_outfile,
                               pApplyCorrectionCoolerLoad=False)
    assert reader is not None
    matrix_test, cut_intervals_test, nan_bins_test, distance_counts_test, correction_factors_test = reader.load()

    # expected values: the source values rounded, explicit zeros dropped
    matrix.data = np.rint(matrix.data)
    matrix.eliminate_zeros()

    nt.assert_almost_equal(matrix.data, matrix_test.data, decimal=0)
    nt.assert_equal(len(cut_intervals), len(cut_intervals_test))
    for expected, observed in ((nan_bins, nan_bins_test),
                               (distance_counts, distance_counts_test)):
        nt.assert_equal(expected, observed)

    os.unlink(cool_outfile)
def test_save_scool_matrixHandlersCool():
    """Write three cool handlers into one scool file and check its cell list.

    The same loaded matrix is stored under the names 'cell1', 'cell2' and
    'cell3'; after saving, each name must be listed below '/cells/' by
    ``cooler.fileops.list_scool_cells``.
    """
    outfile = NamedTemporaryFile(suffix='.scool',
                                 prefix='hicmatrix_scool_test')

    cool_path = ROOT + 'GSE63525_GM12878_insitu_primary_2_5mb_hic2cool051.cool'
    source = MatrixFileHandler(pFileType='cool', pMatrixFile=cool_path)
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = source.load()

    # one output handler per cell, all carrying the same data
    cell_names = ['cell1', 'cell2', 'cell3']
    cell_handlers = []
    for cell_name in cell_names:
        cell_handler = MatrixFileHandler(pFileType='cool',
                                         pMatrixFile=cell_name,
                                         pEnforceInteger=False,
                                         pFileWasH5=False,
                                         pHic2CoolVersion=None)
        cell_handler.set_matrix_variables(matrix, cut_intervals, nan_bins,
                                          correction_factors, distance_counts)
        cell_handlers.append(cell_handler)

    scool_handler = MatrixFileHandler(pFileType='scool')
    scool_handler.matrixFile.coolObjectsList = cell_handlers
    scool_handler.save(outfile.name, pSymmetric=True, pApplyCorrection=False)

    content_of_scool = cooler.fileops.list_scool_cells(outfile.name)
    for cell_name in cell_names:
        assert '/cells/' + cell_name in content_of_scool
示例#17
0
def test_save_homer():
    """Load a homer matrix, save it again with the same handler and clean up."""
    target_path = outfile + '.homer'

    # create matrixFileHandler instance with filetype 'homer'
    source_path = ROOT + 'test_matrix.homer'
    handler = MatrixFileHandler(pFileType='homer', pMatrixFile=source_path)
    assert handler is not None

    # load the data and feed it straight back into the handler
    loaded = handler.load()
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = loaded
    handler.set_matrix_variables(matrix, cut_intervals, nan_bins,
                                 correction_factors, distance_counts)

    # write it out (original note on this call: "not implemented")
    handler.save(pName=target_path, pSymmetric=False, pApplyCorrection=False)
    os.unlink(target_path)
示例#18
0
def compute_sum(pMatrixName, pMatricesList, pThread, pQueue):
    """Put the list of matrix sums, one per entry of ``pMatricesList``, on ``pQueue``.

    Each matrix is loaded from ``pMatrixName`` via the 'cool' handler and
    summed with its ``sum()`` method.  ``pThread`` is not used here.
    """
    totals = []
    for cell_name in pMatricesList:
        handler = MatrixFileHandler(pFileType='cool',
                                    pMatrixFile=pMatrixName + '::' + cell_name)
        _matrix, _, _, _, _ = handler.load()
        totals.append(_matrix.sum())
    pQueue.put(totals)
示例#19
0
def test_load_h5_save_cool():
    """Convert h5 data to cool and compare the stored result with expectations.

    The h5 matrix is written through a 'cool' handler flagged as coming from
    h5 with ``pApplyCorrection=True``.  After re-loading, the values must
    equal the original values divided by the product of the row and column
    correction factors, and the stored correction factors must equal the
    reciprocal of the originals with nan/inf replaced by 0.
    """
    cool_outfile = outfile + '.cool'

    # create matrixFileHandler instance with filetype 'h5' and load its data
    h5_path = ROOT + 'Li_et_al_2015.h5'
    source = MatrixFileHandler(pFileType='h5', pMatrixFile=h5_path)
    assert source is not None
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = source.load()

    # hand the data to a cool handler and save it
    writer = MatrixFileHandler(pFileType='cool')
    writer.set_matrix_variables(matrix, cut_intervals, nan_bins,
                                correction_factors, distance_counts)
    writer.matrixFile.fileWasH5 = True
    writer.save(pName=cool_outfile, pSymmetric=False, pApplyCorrection=True)

    # read the written file back
    reader = MatrixFileHandler(pFileType='cool', pMatrixFile=cool_outfile)
    assert reader is not None
    matrix_test, cut_intervals_test, nan_bins_test, distance_counts_test, correction_factors_test = reader.load()

    # expected values: original value / (row factor * column factor)
    rows, cols = matrix.nonzero()
    pair_factors = correction_factors[rows] * correction_factors[cols]
    nt.assert_almost_equal(matrix.data / pair_factors,
                           matrix_test.data,
                           decimal=1)

    nt.assert_equal(len(cut_intervals), len(cut_intervals_test))
    nt.assert_equal(nan_bins, nan_bins_test)
    nt.assert_equal(distance_counts, distance_counts_test)

    # expected factors: reciprocal of the originals, non-finite values as 0
    inverted_factors = 1 / correction_factors
    inverted_factors[np.isnan(inverted_factors)] = 0
    inverted_factors[np.isinf(inverted_factors)] = 0
    nt.assert_equal(inverted_factors, correction_factors_test)

    os.unlink(cool_outfile)
示例#20
0
def test_save_h5():
    """Load an h5 matrix, save it again with the same handler and clean up."""
    target_path = outfile + '.h5'

    # create matrixFileHandler instance with filetype 'h5'
    source_path = ROOT + 'Li_et_al_2015.h5'
    handler = MatrixFileHandler(pFileType='h5', pMatrixFile=source_path)
    assert handler is not None

    # load the data and feed it straight back into the handler
    loaded = handler.load()
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = loaded
    handler.set_matrix_variables(matrix, cut_intervals, nan_bins,
                                 correction_factors, distance_counts)

    # write it out; the arguments are passed positionally as in the original
    handler.save(target_path, True, None)

    os.unlink(target_path)
示例#21
0
def compute_normalize(pMatrixName, pMatricesList, pArgminSum, pSumOfAll,
                      pAppend, pQueue):
    """Scale every matrix by ``pSumOfAll[i] / pArgminSum`` and queue the results.

    Each matrix is loaded from ``pMatrixName``, converted to float32, cleaned
    of nan/inf values, divided by its adjustment factor, cleaned again and
    stripped of explicit zeros.  The data is stored in a new 'cool' handler;
    the list of these handlers is put on ``pQueue``.

    The first output handler gets ``pAppend=False`` when this function's
    ``pAppend`` argument is truthy; every other handler gets ``pAppend=True``.
    """
    non_finite_checks = (np.isnan, np.isinf)

    output_handlers = []
    for i, cell_name in enumerate(pMatricesList):
        append = not (i == 0 and pAppend)

        reader = MatrixFileHandler(pFileType='cool',
                                   pMatrixFile=pMatrixName + '::' + cell_name)
        _matrix, cut_intervals, nan_bins, distance_counts, correction_factors = reader.load()

        # work in float32 and zero out nan/inf before scaling
        _matrix.data = _matrix.data.astype(np.float32)
        for is_bad in non_finite_checks:
            _matrix.data[is_bad(_matrix.data)] = 0

        adjust_factor = pSumOfAll[i] / pArgminSum
        _matrix.data /= adjust_factor

        # the division may create new nan/inf values; zero them as well
        for is_bad in non_finite_checks:
            _matrix.data[is_bad(_matrix.data)] = 0
        _matrix.eliminate_zeros()

        writer = MatrixFileHandler(pFileType='cool',
                                   pAppend=append,
                                   pEnforceInteger=False,
                                   pFileWasH5=False,
                                   pHic2CoolVersion=None)
        writer.set_matrix_variables(_matrix, cut_intervals, nan_bins,
                                    correction_factors, distance_counts)
        output_handlers.append(writer)

    pQueue.put(output_handlers)
示例#22
0
def test_hicConvertFormat_2D_text_to_cool():
    """Convert a 2D-text file to cool and compare it with a stored reference.

    hicConvertFormat is run at resolution 10000 with the hg19 chromosome
    sizes; the upper triangle of the result must match the reference cool
    file after rounding to whole numbers.
    """
    outfile = NamedTemporaryFile(suffix='.cool', delete=False)
    outfile.close()

    text_2d = ROOT + '/GSM1436265_RAD21_ENCFF002EMQ.txt'
    chromosome_sizes = ROOT + '/hg19.chrom.sizes'
    command = "--matrices {} --outFileName {} --inputFormat 2D-text --outputFormat cool -r 10000 --chromosomeSizes {}"
    args = command.format(text_2d, outfile.name, chromosome_sizes).split()
    compute(hicConvertFormat.main, args, 5)

    new = hm.hiCMatrix(outfile.name)

    # reference result shipped with the test data
    reference = MatrixFileHandler(pFileType='cool',
                                  pMatrixFile=ROOT + '/2dtexttocool.cool')
    _matrix, _, _, _, _ = reference.load()

    # compare only the upper triangle of the converted matrix
    new.matrix = triu(new.matrix)
    nt.assert_array_almost_equal(new.matrix.data, _matrix.data, decimal=0)
示例#23
0
def test_load_cool2(capsys):
    """Load ``one_interaction_4chr.cool`` and verify matrix, bins and metadata.

    The file holds one interaction (chr1 10000 - chr1 200000). With 50 kb
    bins that is a single 1 between bin 0 and bin 3.
    """
    bin_size = 50000
    cool_path = ROOT + 'one_interaction_4chr.cool'

    fh = MatrixFileHandler(pFileType='cool', pMatrixFile=cool_path)
    assert fh is not None

    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = fh.load()

    # exactly one stored value
    nt.assert_almost_equal(matrix.data, np.array([1]))

    # row 3 must be all zeros, row 0 carries its single entry in column 3
    expected_row = np.zeros((1, 9167), dtype=int)
    nt.assert_almost_equal(matrix[3].todense(), expected_row)
    expected_row[0, 3] = 1
    nt.assert_almost_equal(matrix[0].todense(), expected_row)

    # (chromosome, number of full-size bins, start/end of the trailing bin)
    layout = [('chr1', 3909, 195450000, 195471971),
              ('chrX', 3420, 171000000, 171031299),
              ('chrY', 1834, 91700000, 91744698)]
    test_cut_intervals = []
    for chrom, full_bins, tail_start, tail_end in layout:
        test_cut_intervals.extend(
            (chrom, k * bin_size, (k + 1) * bin_size, 1.0)
            for k in range(full_bins))
        test_cut_intervals.append((chrom, tail_start, tail_end, 1.0))
    test_cut_intervals.append(('chrM', 0, 16299, 1.0))

    # field-by-field comparison of every loaded interval
    for row, observed in enumerate(cut_intervals):
        for col, value in enumerate(observed):
            assert value == test_cut_intervals[row][col]

    nt.assert_almost_equal(nan_bins[:4], [0, 1, 2, 4])

    assert distance_counts is None
    assert correction_factors is None
示例#24
0
def test_hicConvertFormat_hicpro_to_cool():
    """Convert a HiC-Pro matrix to cool and compare it with the HiC-Pro input.

    ``hicConvertFormat.main`` is run (through ``compute``) with input format
    ``hicpro`` and output format ``cool``. After applying ``triu`` to the
    converted matrix, its ``data`` array must match the ``data`` array
    obtained by loading the HiC-Pro files directly, to zero decimals.
    """
    import os  # local import: only needed to remove the temporary output

    outfile = NamedTemporaryFile(suffix='.cool', delete=False)
    outfile.close()
    try:
        hicprofile = ROOT + '/test_matrix.hicpro'
        bedfile = ROOT + '/test_matrix.bed'
        # Build argv as a list so that paths containing whitespace are not
        # torn apart, which formatting one string and calling split() would do.
        args = [
            '--matrices', hicprofile,
            '--outFileName', outfile.name,
            '--inputFormat', 'hicpro',
            '--outputFormat', 'cool',
            '--bedFileHicpro', bedfile,
        ]
        compute(hicConvertFormat.main, args, 5)

        new = hm.hiCMatrix(outfile.name)

        matrixFileHandlerInput = MatrixFileHandler(pFileType='hicpro',
                                                   pMatrixFile=hicprofile,
                                                   pBedFileHicPro=bedfile)

        _matrix, cut_intervals, nan_bins, \
            distance_counts, correction_factors = matrixFileHandlerInput.load()

        new.matrix = triu(new.matrix)
        nt.assert_array_almost_equal(new.matrix.data, _matrix.data, decimal=0)
    finally:
        # The file was created with delete=False, so it has to be removed
        # explicitly, also when the conversion or an assertion fails.
        if os.path.exists(outfile.name):
            os.unlink(outfile.name)
示例#25
0
def test_load_homer(capsys):
    """Load ``test_matrix.homer`` and check its first row, bins and metadata."""
    homer_path = ROOT + 'test_matrix.homer'
    fh = MatrixFileHandler(pFileType='homer', pMatrixFile=homer_path)
    assert fh is not None

    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = fh.load()

    # expected content of the first matrix row
    first_row = [1.0, 0.1896, 0.2163, 0.08288, 0.1431, 0.2569, 0.1315,
                 0.1488, -0.0312, 0.143, 0.06091, 0.03546, 0.1168]
    test_matrix = np.array([first_row])
    nt.assert_almost_equal(matrix[0].todense(), test_matrix)

    # 13 consecutive 20 kb bins on 3R, from 1,000,000 up to 1,260,000
    test_cut_intervals = [('3R', start, start + 20000, 1)
                          for start in range(1000000, 1260000, 20000)]
    nt.assert_equal(cut_intervals, test_cut_intervals)

    # the loaded homer file yields none of the optional outputs
    assert nan_bins is None
    assert distance_counts is None
    assert correction_factors is None
示例#26
0
def main(args=None):
    """Load cells of a single-cell cool file in parallel and write them out.

    Steps:

    1. List the cells of ``args.matrix`` with ``cell_name_list``. For the
       actions ``extractToCool`` and ``extractScool`` the list is restricted
       to the names found in ``args.cellList`` (one per line; ``/cells/`` is
       prepended to lines that do not start with ``/cells``).
    2. Load the first cell to obtain the cut intervals handed to every worker.
    3. Split the cells over ``args.threads`` worker processes running
       ``load_cool_files`` and collect one result per worker from its queue.
    4. For the actions ``extractScool`` and ``update``, save all collected
       handlers via a single ``scool`` handler; otherwise save each handler
       as its own ``.cool`` file inside the directory ``args.outFileName``.

    :param args: argument list forwarded to the argument parser
    :raises OSError: if no cell is left for processing
    :raises SystemExit: with status 1 if a worker reports a failure
    """
    args = parse_arguments().parse_args(args)
    log.debug(args)

    matrices_list = cell_name_list(args.matrix)
    if args.action in ['extractToCool', 'extractScool']:
        if args.cellList is not None:
            # set lookup instead of scanning the whole list for every line
            known_cells = set(matrices_list)
            matrix_list_tmp = []
            with open(args.cellList, 'r') as file:
                for line in file:
                    values = line.strip()
                    log.debug('values {}'.format(values))
                    if not values.startswith('/cells'):
                        values = '/cells/' + values
                    if values in known_cells:
                        matrix_list_tmp.append(values)

            matrices_list = matrix_list_tmp

    if len(matrices_list) == 0:
        raise OSError('No cells for processing. Terminating.')
    # never start more workers than there are cells
    if len(matrices_list) < args.threads:
        args.threads = len(matrices_list)

    matrixFileHandlerInput = MatrixFileHandler(pFileType='cool',
                                               pMatrixFile=args.matrix + "::" +
                                               matrices_list[0])

    # only the cut intervals of the first cell are used further on
    _matrix, cut_intervals_all, nan_bins, \
        distance_counts, correction_factors = matrixFileHandlerInput.load()

    threads = args.threads

    matrixFileHandler_list = [None] * threads
    process = [None] * threads
    queue = [None] * threads
    thread_done = [False] * threads
    matricesPerThread = len(matrices_list) // threads

    for i in range(threads):
        if i < threads - 1:
            matrices_name_list = matrices_list[i * matricesPerThread:(i + 1) *
                                               matricesPerThread]
        else:
            # the last worker also takes the remainder of the division
            matrices_name_list = matrices_list[i * matricesPerThread:]

        queue[i] = Queue()
        process[i] = Process(target=load_cool_files,
                             kwargs=dict(pMatrixName=args.matrix,
                                         pMatricesList=matrices_name_list,
                                         pCutIntervals=cut_intervals_all,
                                         pQueue=queue[i]))
        process[i].start()

    fail_flag = False
    fail_message = ''
    while True:
        for i in range(threads):
            if queue[i] is not None and not queue[i].empty():
                # fetch the result before joining the worker
                matrixFileHandler_list[i] = queue[i].get()
                # a result containing 'Fail:' marks a failure; the text after
                # the first six characters is logged as the error message
                if 'Fail:' in matrixFileHandler_list[i]:
                    fail_flag = True
                    fail_message = matrixFileHandler_list[i][6:]
                queue[i] = None
                process[i].join()
                process[i].terminate()
                process[i] = None
                thread_done[i] = True
        if all(thread_done):
            break
        # poll again later; no pointless wait once every worker has reported
        time.sleep(1)

    if fail_flag:
        log.error(fail_message)
        exit(1)
    matrix_file_handler_object_list = [
        item for sublist in matrixFileHandler_list for item in sublist
    ]

    if args.action in ['extractScool', 'update']:
        matrixFileHandler = MatrixFileHandler(pFileType='scool')
        matrixFileHandler.matrixFile.coolObjectsList = matrix_file_handler_object_list
        matrixFileHandler.save(args.outFileName,
                               pSymmetric=True,
                               pApplyCorrection=False)
    else:
        if not os.path.exists(args.outFileName):
            try:
                os.makedirs(args.outFileName)
            except OSError as exc:  # Guard against race condition
                if exc.errno != errno.EEXIST:
                    raise
        for matrixFileHandler in matrix_file_handler_object_list:
            matrixFileHandler.save(
                args.outFileName + '/' +
                matrixFileHandler.matrixFile.matrixFileName + '.cool',
                pApplyCorrection=True,
                pSymmetric=True)
示例#27
0
def main(args=None):
    """Run ``compute_correction`` on all cells in parallel and save a scool file.

    The cells of ``args.matrix`` (listed with ``cell_name_list``) are split
    over up to ``args.threads`` worker processes. Each worker puts one result
    on its queue: a string result is logged as an error and marks the run as
    failed, anything else is treated as an iterable of handler objects. The
    collected handlers are saved via a single ``scool`` handler to
    ``args.outFileName``.

    :param args: argument list forwarded to the argument parser
    :raises SystemExit: with status 1 if a worker reported an error
    """
    args = parse_arguments().parse_args(args)

    matrices_list = cell_name_list(args.matrix)
    threads = args.threads
    # never start more workers than there are cells
    if len(matrices_list) < threads:
        threads = len(matrices_list)

    # Sized AFTER clamping the worker count: sizing it with the requested
    # number left None entries behind that broke the flattening step below.
    matrixFileHandler_list = [None] * threads

    matrixFileHandlerInput = MatrixFileHandler(pFileType='cool', pMatrixFile=args.matrix + "::" + matrices_list[0])

    # only the cut intervals of the first cell are used further on
    _matrix, cut_intervals_all, nan_bins, \
        distance_counts, correction_factors = matrixFileHandlerInput.load()

    thread_done = [False] * threads
    matricesPerThread = len(matrices_list) // threads
    queue = [None] * threads
    process = [None] * threads
    print('Threads: ' + str(threads))
    for i in range(threads):

        if i < threads - 1:
            matrices_name_list = matrices_list[i * matricesPerThread:(i + 1) * matricesPerThread]
        else:
            # the last worker also takes the remainder of the division
            matrices_name_list = matrices_list[i * matricesPerThread:]

        queue[i] = Queue()
        process[i] = Process(target=compute_correction, kwargs=dict(
            pMatrixName=args.matrix,
            pMatrixList=matrices_name_list,
            pCutIntervals=cut_intervals_all,
            pQueue=queue[i]
        )
        )

        process[i].start()

    fail_flag = False
    while True:
        for i in range(threads):
            if queue[i] is not None and not queue[i].empty():
                # fetch the result before joining the worker
                matrixFileHandler_list[i] = queue[i].get()
                if isinstance(matrixFileHandler_list[i], str):
                    log.error('{}'.format(matrixFileHandler_list[i]))
                    fail_flag = True
                queue[i] = None
                process[i].join()
                process[i].terminate()
                process[i] = None
                thread_done[i] = True
        if all(thread_done):
            break
        # poll again later; no pointless wait once every worker has reported
        time.sleep(1)

    if fail_flag:
        exit(1)
    matrix_file_handler_object_list = [item for sublist in matrixFileHandler_list for item in sublist]

    matrixFileHandler = MatrixFileHandler(pFileType='scool')
    matrixFileHandler.matrixFile.coolObjectsList = matrix_file_handler_object_list
    matrixFileHandler.save(args.outFileName, pSymmetric=True, pApplyCorrection=False)
示例#28
0
def main(args=None):
    """Convert Hi-C matrices between file formats.

    Supported paths visible in this function:

    * ``hic`` -> ``cool``: delegated to ``hic2cool_convert``, once per matrix
      (resolution 0) or once per entry of ``args.resolutions``, in which case
      ``_<resolution>`` is inserted before the output file extension.
    * ``hicpro`` / ``homer`` / ``h5`` / ``cool`` -> ``cool`` / ``h5`` /
      ``homer`` / ``ginteractions``: load with ``MatrixFileHandler`` and save
      to ``args.outFileName[i]``.
    * the same inputs -> ``mcool``: every resolution is stored under
      ``args.outFileName[0] + '::/resolutions/<resolution>'``; with
      ``args.resolutions`` the bins of the loaded matrix are merged first.

    Any other format combination falls through without doing anything.

    :param args: argument list forwarded to the argument parser
    :raises SystemExit: with status 1 if the number of matrices, output files
        or HiC-Pro bed files does not match
    """
    import os  # local import: used for extension-safe output naming

    args = parse_arguments().parse_args(args)
    log.debug(args)

    # parse from hicpro, homer, h5 and hic to cool
    if args.inputFormat != 'hic' and args.outputFormat != 'mcool':
        if len(args.matrices) != len(args.outFileName):
            log.error(
                'Number of input matrices does not match number output matrices!'
            )
            # sys.exit instead of the site-provided exit(); consistent with
            # the other error exit in this function
            sys.exit(1)
    if args.inputFormat == 'hic' and args.outputFormat == 'cool':
        log.info('Converting with hic2cool.')
        for i, matrix in enumerate(args.matrices):
            if args.resolutions is None:
                hic2cool_convert(matrix, args.outFileName[i], 0)
            else:
                # splitext only looks at the extension of the last path
                # component, so names without a dot or with dotted directory
                # names are handled as well
                root, extension = os.path.splitext(args.outFileName[i])
                for resolution in args.resolutions:
                    out_name = root + '_' + str(resolution) + extension
                    hic2cool_convert(matrix, out_name, resolution)
        return
    elif args.inputFormat in ['hicpro', 'homer', 'h5', 'cool']:
        format_was_h5 = args.inputFormat == 'h5'
        applyCorrection = not args.store_applied_correction
        if args.inputFormat == 'hicpro':
            if len(args.matrices) != len(args.bedFileHicpro):
                log.error(
                    'Number of matrices and associated bed files need to be the same.'
                )
                log.error('Matrices: {}; Bed files: {}'.format(
                    len(args.matrices), len(args.bedFileHicpro)))
                sys.exit(1)

        for i, matrix in enumerate(args.matrices):
            if args.inputFormat == 'hicpro':
                matrixFileHandlerInput = MatrixFileHandler(
                    pFileType=args.inputFormat,
                    pMatrixFile=matrix,
                    pBedFileHicPro=args.bedFileHicpro[i])
            else:
                correction_operator = '/' if args.correction_division else None
                chromosomes_to_load = [args.chromosome] if args.chromosome else None
                applyCorrectionCoolerLoad = not args.load_raw_values
                matrixFileHandlerInput = MatrixFileHandler(
                    pFileType=args.inputFormat,
                    pMatrixFile=matrix,
                    pCorrectionFactorTable=args.correction_name,
                    pCorrectionOperator=correction_operator,
                    pChrnameList=chromosomes_to_load,
                    pEnforceInteger=args.enforce_integer,
                    pApplyCorrectionCoolerLoad=applyCorrectionCoolerLoad)

            _matrix, cut_intervals, nan_bins, \
                distance_counts, correction_factors = matrixFileHandlerInput.load()

            log.debug('Setting done')

            if args.outputFormat in ['cool', 'h5', 'homer', 'ginteractions']:
                if args.outputFormat in ['homer', 'ginteractions']:
                    # make it a upper triangular matrix in case it is not already
                    _matrix = triu(_matrix)
                    # make it a full symmetrical matrix
                    _matrix = _matrix.maximum(_matrix.T)
                hic2CoolVersion = None
                if args.inputFormat == 'cool':
                    # carry the hic2cool version of the input over to the output
                    hic2CoolVersion = matrixFileHandlerInput.matrixFile.hic2cool_version
                matrixFileHandlerOutput = MatrixFileHandler(
                    pFileType=args.outputFormat,
                    pEnforceInteger=args.enforce_integer,
                    pFileWasH5=format_was_h5,
                    pHic2CoolVersion=hic2CoolVersion)

                matrixFileHandlerOutput.set_matrix_variables(
                    _matrix, cut_intervals, nan_bins, correction_factors,
                    distance_counts)
                matrixFileHandlerOutput.save(args.outFileName[i],
                                             pSymmetric=True,
                                             pApplyCorrection=applyCorrection)
            elif args.outputFormat in ['mcool']:

                log.debug('outformat is mcool')
                if args.resolutions and len(args.matrices) > 1:
                    # NOTE(review): only logged, processing continues and every
                    # matrix is written to the same resolution paths - confirm
                    # whether this should abort instead.
                    log.error(
                        'Please define one matrix and many resolutions which should be created or multiple matrices.'
                    )
                if args.resolutions:
                    log.info(
                        'Correction factors are removed. They are not valid for any new created resolution.'
                    )
                    hic_matrix = HiCMatrix.hiCMatrix()
                    hic_matrix.setMatrix(_matrix, cut_intervals)

                    bin_size = hic_matrix.getBinSize()

                    for j, resolution in enumerate(args.resolutions):
                        # every resolution starts from its own copy
                        hic_matrix_res = deepcopy(hic_matrix)

                        _mergeFactor = int(resolution) // bin_size

                        log.debug('bin size {}'.format(bin_size))
                        log.debug('_mergeFactor {}'.format(_mergeFactor))
                        if int(resolution) != bin_size:
                            merged_matrix = hicMergeMatrixBins.merge_bins(
                                hic_matrix_res, _mergeFactor)
                        else:
                            # requested resolution equals the input bin size
                            merged_matrix = hic_matrix_res
                        matrixFileHandlerOutput = MatrixFileHandler(
                            pFileType='cool',
                            pEnforceInteger=args.enforce_integer,
                            # append from the second resolution on
                            pAppend=j > 0,
                            pFileWasH5=format_was_h5)

                        matrixFileHandlerOutput.set_matrix_variables(
                            merged_matrix.matrix, merged_matrix.cut_intervals,
                            merged_matrix.nan_bins,
                            merged_matrix.correction_factors,
                            merged_matrix.distance_counts)
                        matrixFileHandlerOutput.save(
                            args.outFileName[0] + '::/resolutions/' +
                            str(resolution),
                            pSymmetric=True,
                            pApplyCorrection=applyCorrection)

                else:
                    hic_matrix = HiCMatrix.hiCMatrix()
                    hic_matrix.setMatrix(_matrix, cut_intervals)
                    bin_size = hic_matrix.getBinSize()
                    matrixFileHandlerOutput = MatrixFileHandler(
                        pFileType='cool',
                        # append from the second input matrix on
                        pAppend=i > 0,
                        pFileWasH5=format_was_h5)

                    matrixFileHandlerOutput.set_matrix_variables(
                        _matrix, cut_intervals, nan_bins, correction_factors,
                        distance_counts)
                    matrixFileHandlerOutput.save(
                        args.outFileName[0] + '::/resolutions/' +
                        str(bin_size),
                        pSymmetric=True,
                        pApplyCorrection=applyCorrection)
示例#29
0
def main(args=None):
    """Convert Hi-C matrices between file formats.

    Supported paths visible in this function:

    * ``hic`` -> ``cool``: delegated to ``hic2cool_convert``, once per matrix
      (resolution 0) or once per entry of ``args.resolutions``, in which case
      ``_<resolution>`` is inserted before the output file extension.
    * ``hicpro`` / ``homer`` / ``h5`` / ``cool`` -> ``cool`` / ``h5`` /
      ``homer`` / ``ginteractions``: load with ``MatrixFileHandler`` and save
      to ``args.outFileName[i] + '.' + args.outputFormat``.
    * the same inputs -> ``mcool``: every resolution is stored under
      ``args.outFileName[0] + '.mcool::/resolutions/<resolution>'``; with
      ``args.resolutions`` the bins of the loaded matrix are merged first.

    Any other format combination falls through without doing anything.

    :param args: argument list forwarded to the argument parser
    :raises SystemExit: with status 1 if the number of matrices, output files
        or HiC-Pro bed files does not match
    """
    import os  # local import: used for extension-safe output naming

    args = parse_arguments().parse_args(args)
    log.debug(args)

    # parse from hicpro, homer, h5 and hic to cool
    if args.inputFormat != 'hic' and args.outputFormat != 'mcool':
        if len(args.matrices) != len(args.outFileName):
            log.error(
                'Number of input matrices does not match number output matrices!'
            )
            # sys.exit instead of the site-provided exit(); consistent with
            # the other error exit in this function
            sys.exit(1)
    if args.inputFormat == 'hic' and args.outputFormat == 'cool':
        log.info('Converting with hic2cool.')
        for i, matrix in enumerate(args.matrices):
            if args.resolutions is None:
                hic2cool_convert(matrix, args.outFileName[i], 0)
            else:
                # The output name has to be derived per resolution inside the
                # loop. splitext only looks at the extension of the last path
                # component, so names without a dot are handled as well.
                root, extension = os.path.splitext(args.outFileName[i])
                for resolution in args.resolutions:
                    out_name = root + '_' + str(resolution) + extension
                    hic2cool_convert(matrix, out_name, resolution)
        return
    elif args.inputFormat in ['hicpro', 'homer', 'h5', 'cool']:
        applyCorrection = not args.store_applied_correction
        if args.inputFormat == 'hicpro':
            if len(args.matrices) != len(args.bedFileHicpro):
                log.error(
                    'Number of matrices and associated bed files need to be the same.'
                )
                log.error('Matrices: {}; Bed files: {}'.format(
                    len(args.matrices), len(args.bedFileHicpro)))
                sys.exit(1)

        for i, matrix in enumerate(args.matrices):
            if args.inputFormat == 'hicpro':
                matrixFileHandlerInput = MatrixFileHandler(
                    pFileType=args.inputFormat,
                    pMatrixFile=matrix,
                    pBedFileHicPro=args.bedFileHicpro[i])
            else:
                correction_operator = '/' if args.correction_division else None
                chromosomes_to_load = [args.chromosome] if args.chromosome else None
                matrixFileHandlerInput = MatrixFileHandler(
                    pFileType=args.inputFormat,
                    pMatrixFile=matrix,
                    pCorrectionFactorTable=args.correction_name,
                    pCorrectionOperator=correction_operator,
                    pChrnameList=chromosomes_to_load,
                    pEnforceInteger=args.enforce_integer)

            # load() yields distance_counts BEFORE correction_factors, while
            # set_matrix_variables() below takes them the other way round
            _matrix, cut_intervals, nan_bins, \
                distance_counts, correction_factors = matrixFileHandlerInput.load()

            log.debug('Setting done')

            if args.outputFormat in ['cool', 'h5', 'homer', 'ginteractions']:
                matrixFileHandlerOutput = MatrixFileHandler(
                    pFileType=args.outputFormat)

                matrixFileHandlerOutput.set_matrix_variables(
                    _matrix, cut_intervals, nan_bins, correction_factors,
                    distance_counts)
                matrixFileHandlerOutput.save(args.outFileName[i] + '.' +
                                             args.outputFormat,
                                             pSymmetric=True,
                                             pApplyCorrection=applyCorrection)
            elif args.outputFormat in ['mcool']:

                log.debug('outformat is mcool')
                if args.resolutions and len(args.matrices) > 1:
                    # NOTE(review): only logged, processing continues - confirm
                    # whether this should abort instead.
                    log.error(
                        'Please define either one matrix and many resolutions which should be created.'
                    )
                if args.resolutions:
                    log.info(
                        'Correction factors are removed. They are not valid for any new created resolution'
                    )
                    hic_matrix = HiCMatrix.hiCMatrix()
                    hic_matrix.setMatrix(_matrix, cut_intervals)
                    bin_size = hic_matrix.getBinSize()

                    for resolution in args.resolutions:
                        _mergeFactor = int(resolution) // bin_size
                        merged_matrix = hicMergeMatrixBins.merge_bins(
                            hic_matrix, _mergeFactor)
                        matrixFileHandlerOutput = MatrixFileHandler(
                            pFileType='cool',
                            pEnforceInteger=args.enforce_integer)
                        matrixFileHandlerOutput.set_matrix_variables(
                            merged_matrix.matrix, merged_matrix.cut_intervals,
                            merged_matrix.nan_bins,
                            merged_matrix.correction_factors,
                            merged_matrix.distance_counts)
                        matrixFileHandlerOutput.save(
                            args.outFileName[0] + '.mcool' +
                            '::/resolutions/' + str(resolution),
                            pSymmetric=True,
                            pApplyCorrection=applyCorrection)

                else:
                    hic_matrix = HiCMatrix.hiCMatrix()
                    hic_matrix.setMatrix(_matrix, cut_intervals)
                    bin_size = hic_matrix.getBinSize()
                    matrixFileHandlerOutput = MatrixFileHandler(
                        pFileType='cool')
                    matrixFileHandlerOutput.set_matrix_variables(
                        _matrix, cut_intervals, nan_bins, correction_factors,
                        distance_counts)
                    matrixFileHandlerOutput.save(
                        args.outFileName[0] + '.mcool' + '::/resolutions/' +
                        str(bin_size),
                        pSymmetric=True,
                        pApplyCorrection=applyCorrection)
示例#30
0
def main(args=None):
    args = parse_arguments().parse_args(args)
    log.debug(args)

    # parse from hicpro, homer, h5 and hic to cool
    if args.inputFormat != 'hic' and args.outputFormat != 'mcool':
        if len(args.matrices) != len(args.outFileName):
            log.error(
                'Number of input matrices does not match number output matrices!: Input matrices {}; output matrices {}'
                .format(len(args.matrices), len(args.outFileName)))
            exit(1)
    if args.inputFormat == 'hic' and args.outputFormat != 'cool':
        log.error('The export of a hic file is only possible to a cool file.')
        exit(1)
    if args.inputFormat == 'hic' and args.outputFormat == 'cool':
        log.info('Converting with hic2cool.')
        for i, matrix in enumerate(args.matrices):
            if args.resolutions is None:
                hic2cool_convert(matrix, args.outFileName[i], 0)
            else:

                for resolution in args.resolutions:
                    out_name = args.outFileName[i].split('.')
                    out_name[-2] = out_name[-2] + '_' + str(resolution)
                    out_name = '.'.join(out_name)
                    hic2cool_convert(matrix, out_name, resolution)
        return
    elif args.inputFormat in ['hicpro', 'homer', 'h5', 'cool', '2D-text']:
        format_was_h5 = False
        if args.inputFormat == 'h5':
            format_was_h5 = True
        applyCorrection = True
        if args.store_applied_correction:
            applyCorrection = False
        if args.inputFormat == 'hicpro':
            if len(args.matrices) != len(args.bedFileHicpro):
                log.error(
                    'Number of matrices and associated bed files need to be the same.'
                )
                log.error('Matrices: {}; Bed files: {}'.format(
                    len(args.matrices), len(args.bedFileHicpro)))
                sys.exit(1)

        if args.inputFormat == '2D-text':
            if args.resolutions is None:
                log.error('The resolution must be defined via --resolutions')
                sys.exit(1)
            if args.chromosomeSizes is None:
                log.error(
                    'The sizes of the chromosomes must be defined via --chromosomeSizes.'
                )
                sys.exit(1)

        for i, matrix in enumerate(args.matrices):
            if args.inputFormat == 'hicpro':
                matrixFileHandlerInput = MatrixFileHandler(
                    pFileType=args.inputFormat,
                    pMatrixFile=matrix,
                    pBedFileHicPro=args.bedFileHicpro[i])
                _matrix, cut_intervals, nan_bins, \
                    distance_counts, correction_factors = matrixFileHandlerInput.load()
            elif args.inputFormat == '2D-text':
                chrom_sizes = OrderedDict()
                size_genome = 0
                with open(args.chromosomeSizes.name, 'r') as file:
                    file_ = True
                    while file_:
                        file_ = file.readline().strip()
                        if file_ != '':
                            line_split = file_.split('\t')
                            chrom_sizes[line_split[0]] = int(line_split[1])
                            size_genome += int(line_split[1])
                chrom_sizes = list(chrom_sizes.items())

                # log.debug('chrom_sizes: {}'.format(chrom_sizes))
                args.resolutions = [int(x) for x in args.resolutions]
                # internal_matrix_size = size_genome // args.resolutions[0]

                cut_intervals = []
                for chromosome in chrom_sizes:
                    for interval in range(0, chromosome[1],
                                          args.resolutions[0]):
                        cut_intervals.append(
                            tuple([
                                chromosome[0], interval,
                                min(chromosome[1],
                                    interval + args.resolutions[0]), 1.0
                            ]))

                hic_matrix_csr = lil_matrix(
                    (len(cut_intervals), len(cut_intervals)))
                log.debug('cut_intervals {}'.format(cut_intervals[:20]))

                hic_matrix = HiCMatrix.hiCMatrix()
                hic_matrix.setMatrix(hic_matrix_csr, cut_intervals)
                # tmp_matrix = coo_matrix(())
                with open(matrix, 'r') as file:
                    for j, line in enumerate(file):
                        line_split = line.split('\t')
                        chromosome_1 = str(line_split[0])
                        start_1 = int(line_split[1])
                        end_1 = int(line_split[2])

                        chromosome_2 = str(line_split[3])
                        start_2 = int(line_split[4])
                        end_2 = int(line_split[5])

                        value = float(line_split[6])
                        bin_id_1 = hic_matrix.getRegionBinRange(
                            chromosome_1, start_1, end_1)
                        bin_id_2 = hic_matrix.getRegionBinRange(
                            chromosome_2, start_2, end_2)
                        try:
                            hic_matrix.matrix[bin_id_1, bin_id_2] = value
                        except Exception as exp:
                            log.debug(str(exp))
                        if j % 1000 == 0:
                            log.debug('{} lines computed'.format(j))
                log.debug('csr with values filled!')
                hic_matrix.matrix = hic_matrix.matrix.tocsr()

                _matrix, cut_intervals, nan_bins, \
                    distance_counts, correction_factors = hic_matrix.matrix, hic_matrix.cut_intervals, hic_matrix.nan_bins, \
                    hic_matrix.distance_counts, hic_matrix.correction_factors

            else:
                correction_operator = None

                if args.correction_division:
                    correction_operator = '/'

                chromosomes_to_load = None
                if args.chromosome:
                    chromosomes_to_load = [args.chromosome]
                applyCorrectionCoolerLoad = True
                if args.load_raw_values:
                    applyCorrectionCoolerLoad = False
                matrixFileHandlerInput = MatrixFileHandler(
                    pFileType=args.inputFormat,
                    pMatrixFile=matrix,
                    pCorrectionFactorTable=args.correction_name,
                    pCorrectionOperator=correction_operator,
                    pChrnameList=chromosomes_to_load,
                    pEnforceInteger=args.enforce_integer,
                    pApplyCorrectionCoolerLoad=applyCorrectionCoolerLoad)

                _matrix, cut_intervals, nan_bins, \
                    distance_counts, correction_factors = matrixFileHandlerInput.load()

            log.debug('cut_intervals {}'.format(cut_intervals[:20]))

            log.debug('Setting done')

            if args.outputFormat in ['cool', 'h5', 'homer', 'ginteractions']:
                log.debug('cool h5 homer ginteractions hicpro branch')

                if args.outputFormat in ['homer', 'ginteractions']:
                    log.debug('homer ginteractions branch')

                    # make it a upper triangular matrix in case it is not already
                    _matrix = triu(_matrix)
                    # make it a full symmetrical matrix
                    _matrix = _matrix.maximum(_matrix.T)
                hic2CoolVersion = None
                cool_metadata = None
                if args.inputFormat == 'cool':
                    hic2CoolVersion = matrixFileHandlerInput.matrixFile.hic2cool_version
                    cool_metadata = matrixFileHandlerInput.matrixFile.hic_metadata

                log.debug('cool_metadata {}'.format(cool_metadata))
                matrixFileHandlerOutput = MatrixFileHandler(
                    pFileType=args.outputFormat,
                    pEnforceInteger=args.enforce_integer,
                    pFileWasH5=format_was_h5,
                    pHic2CoolVersion=hic2CoolVersion,
                    pHiCInfo=cool_metadata)

                matrixFileHandlerOutput.set_matrix_variables(
                    _matrix, cut_intervals, nan_bins, correction_factors,
                    distance_counts)
                log.debug('len(args.outFileName) {}, i {}'.format(
                    len(args.outFileName), i))
                matrixFileHandlerOutput.save(args.outFileName[i],
                                             pSymmetric=True,
                                             pApplyCorrection=applyCorrection)

            if args.outputFormat == 'hicpro':
                log.debug('hicpro branch')
                if len(args.matrices) == len(args.outFileName) and len(
                        args.outFileName) == len(args.bedFileHicpro):
                    log.debug('args.bedFileHicpro[i] {}'.format(
                        args.bedFileHicpro[i]))
                    matrixFileHandlerOutput = MatrixFileHandler(
                        pFileType=args.outputFormat,
                        pBedFileHicPro=args.bedFileHicpro[i])

                    matrixFileHandlerOutput.set_matrix_variables(
                        _matrix, cut_intervals, nan_bins, correction_factors,
                        distance_counts)
                    matrixFileHandlerOutput.save(
                        args.outFileName[i],
                        pSymmetric=True,
                        pApplyCorrection=applyCorrection)
                else:
                    log.error(
                        'The number of input matrices, output files and bed files does not match: Input: {}; Output: {}; Bed: {}'
                        .format(len(args.matrix), len(args.outFileName),
                                len(args.bedFileHicpro)))
                    exit(1)
            elif args.outputFormat in ['mcool']:

                log.debug('outformat is mcool')
                if args.resolutions and len(args.matrices) > 1:
                    log.error(
                        'Please define one matrix and many resolutions which should be created or multiple matrices.'
                    )
                if args.resolutions:
                    log.info(
                        'Correction factors are removed. They are not valid for any new created resolution.'
                    )
                    hic_matrix = HiCMatrix.hiCMatrix()
                    hic_matrix.setMatrix(_matrix, cut_intervals)

                    bin_size = hic_matrix.getBinSize()
                    hic2CoolVersion = None
                    cool_metadata = None
                    if args.inputFormat == 'cool':
                        hic2CoolVersion = matrixFileHandlerInput.matrixFile.hic2cool_version
                        cool_metadata = matrixFileHandlerInput.matrixFile.hic_metadata
                    for j, resolution in enumerate(args.resolutions):
                        hic_matrix_res = deepcopy(hic_matrix)

                        _mergeFactor = int(resolution) // bin_size

                        log.debug('bin size {}'.format(bin_size))
                        log.debug('_mergeFactor {}'.format(_mergeFactor))
                        if int(resolution) != bin_size:
                            merged_matrix = hicMergeMatrixBins.merge_bins(
                                hic_matrix_res, _mergeFactor)
                        else:
                            merged_matrix = hic_matrix_res
                        append = False
                        if j > 0:
                            append = True
                        matrixFileHandlerOutput = MatrixFileHandler(
                            pFileType='cool',
                            pEnforceInteger=args.enforce_integer,
                            pAppend=append,
                            pFileWasH5=format_was_h5,
                            pHic2CoolVersion=hic2CoolVersion,
                            pHiCInfo=cool_metadata)

                        matrixFileHandlerOutput.set_matrix_variables(
                            merged_matrix.matrix, merged_matrix.cut_intervals,
                            merged_matrix.nan_bins,
                            merged_matrix.correction_factors,
                            merged_matrix.distance_counts)
                        matrixFileHandlerOutput.save(
                            args.outFileName[0] + '::/resolutions/' +
                            str(resolution),
                            pSymmetric=True,
                            pApplyCorrection=applyCorrection)

                else:
                    append = False
                    if i > 0:
                        append = True
                    hic_matrix = HiCMatrix.hiCMatrix()
                    hic_matrix.setMatrix(_matrix, cut_intervals)
                    bin_size = hic_matrix.getBinSize()
                    matrixFileHandlerOutput = MatrixFileHandler(
                        pFileType='cool',
                        pAppend=append,
                        pFileWasH5=format_was_h5)

                    matrixFileHandlerOutput.set_matrix_variables(
                        _matrix, cut_intervals, nan_bins, correction_factors,
                        distance_counts)
                    matrixFileHandlerOutput.save(
                        args.outFileName[0] + '::/resolutions/' +
                        str(bin_size),
                        pSymmetric=True,
                        pApplyCorrection=applyCorrection)