示例#1
0
def _obs_exp_lieberman(pSubmatrix, pLengthChromosome, pChromosomeCount):
    """Compute the Lieberman observed/expected matrix and return it dense.

    The three arguments are forwarded unchanged to
    ``obs_exp_matrix_lieberman``. Its result is wrapped in a ``csr_matrix``
    and passed through ``convertNansToZeros``, re-wrapped and passed through
    ``convertInfsToZeros``, and finally converted with ``.todense()``.

    Returns:
        The dense form of the matrix produced by ``convertInfsToZeros``.
    """
    raw_obs_exp = obs_exp_matrix_lieberman(pSubmatrix, pLengthChromosome,
                                           pChromosomeCount)
    nan_cleaned = convertNansToZeros(csr_matrix(raw_obs_exp))
    inf_cleaned = convertInfsToZeros(csr_matrix(nan_cleaned))
    return inf_cleaned.todense()
示例#2
0
def _obs_exp_lieberman(pSubmatrix, pLengthChromosome, pChromosomeCount):
    """Compute the Lieberman observed/expected matrix without densifying it.

    The three arguments are forwarded unchanged to
    ``obs_exp_matrix_lieberman``. Its result is wrapped in a ``csr_matrix``
    and passed through ``convertNansToZeros``, then re-wrapped and passed
    through ``convertInfsToZeros``.

    Returns:
        Whatever ``convertInfsToZeros`` returns; no ``.todense()`` is applied
        here, so densifying (if needed) is left to the caller.
    """
    raw_obs_exp = obs_exp_matrix_lieberman(pSubmatrix, pLengthChromosome,
                                           pChromosomeCount)
    nan_cleaned = convertNansToZeros(csr_matrix(raw_obs_exp))
    # Deliberately returned as-is: no empty-data special case and no
    # conversion to a dense matrix.
    return convertInfsToZeros(csr_matrix(nan_cleaned))
示例#3
0
def _save_transformed_matrix(pTransformedMatrix, pHiCMatrix, pOutputFileName):
    """Save a transformed matrix with the bin metadata of ``pHiCMatrix``.

    Args:
        pTransformedMatrix: matrix exposing ``tocsr()`` (a ``lil_matrix`` in
            ``main``) holding the values to write.
        pHiCMatrix: source matrix object whose ``cut_intervals``,
            ``nan_bins``, ``correction_factors`` and ``distance_counts`` are
            written alongside the values.
        pOutputFileName: target path; a name ending in ``.h5`` selects the
            'h5' file type, anything else is written as 'cool'.
    """
    file_type = 'h5' if pOutputFileName.endswith('.h5') else 'cool'
    matrixFileHandlerOutput = MatrixFileHandler(pFileType=file_type)
    matrixFileHandlerOutput.set_matrix_variables(pTransformedMatrix.tocsr(),
                                                 pHiCMatrix.cut_intervals,
                                                 pHiCMatrix.nan_bins,
                                                 pHiCMatrix.correction_factors,
                                                 pHiCMatrix.distance_counts)
    matrixFileHandlerOutput.save(pOutputFileName, pSymmetric=True,
                                 pApplyCorrection=False)


def main(args=None):
    """Compute per-chromosome eigenvectors and write them to track files.

    For every chromosome of the input matrix an observed/expected matrix is
    computed (``obs_exp_matrix_lieberman`` when ``args.method`` is
    'lieberman', otherwise ``obs_exp_matrix_non_zero``), its Pearson
    correlation matrix is derived with ``np.corrcoef`` and the first
    ``args.numberOfEigenvectors`` columns returned by ``linalg.eig`` are
    collected. The collected values are written as bedgraph or bigwig, one
    output file per eigenvector. Optionally the observed/expected and the
    Pearson matrices are saved as well.

    Args:
        args: list of command line arguments handed to the argument parser;
            ``None`` lets the parser use its default source.

    The process exits with status 1 if the number of output file names does
    not equal the number of eigenvectors, if the output format is unknown, or
    if bigwig output is requested while pyBigWig lacks numpy support.
    """
    args = parse_arguments().parse_args(args)
    if int(args.numberOfEigenvectors) != len(args.outputFileName):
        log.error("Number of output file names and number of eigenvectors"
                  " does not match. Please "
                  "provide the name of each file.\nFiles: {}\nNumber of "
                  "eigenvectors: {}".format(args.outputFileName,
                                            args.numberOfEigenvectors))
        exit(1)

    ma = hm.hiCMatrix(args.matrix)
    ma.maskBins(ma.nan_bins)

    if args.ignoreMaskedBins:
        new_intervals = enlarge_bins(ma.cut_intervals)
        ma.setCutIntervals(new_intervals)

    if args.chromosomes:
        ma.keepOnlyTheseChr(args.chromosomes)

    vecs_list = []
    chrom_list = []
    start_list = []
    end_list = []

    chromosome_names = ma.getChrNames()
    chromosome_count = len(chromosome_names)
    k = args.numberOfEigenvectors

    if args.pearsonMatrix:
        transf_matrix_pearson = lil_matrix(ma.matrix.shape)

    if args.obsexpMatrix:
        transf_matrix_obsexp = lil_matrix(ma.matrix.shape)

    # Total number of bins over all retained chromosomes.
    length_chromosome = 0
    for chrname in chromosome_names:
        chr_range = ma.getChrBinRange(chrname)
        length_chromosome += chr_range[1] - chr_range[0]

    # A bigWig extra track is correlated per chromosome inside the loop; any
    # other extra track is handled once after the loop.
    is_bigwig_track = bool(args.extraTrack) and \
        args.extraTrack.endswith(('.bw', '.bigwig'))
    bwTrack = pyBigWig.open(args.extraTrack, 'r') if is_bigwig_track else None

    try:
        # PCA is computed per chromosome
        for chrname in chromosome_names:
            chr_range = ma.getChrBinRange(chrname)
            bin_slice = slice(chr_range[0], chr_range[1])

            submatrix = ma.matrix[bin_slice, bin_slice]
            if args.method == 'lieberman':
                obs_exp_matrix_ = obs_exp_matrix_lieberman(submatrix,
                                                           length_chromosome,
                                                           chromosome_count)
            else:
                obs_exp_matrix_ = obs_exp_matrix_non_zero(submatrix,
                                                          args.ligation_factor)

            obs_exp_matrix_ = csr_matrix(obs_exp_matrix_).todense()
            if args.obsexpMatrix:
                transf_matrix_obsexp[bin_slice, bin_slice] = lil_matrix(obs_exp_matrix_)

            pearson_correlation_matrix = np.corrcoef(obs_exp_matrix_)
            pearson_correlation_matrix = convertNansToZeros(csr_matrix(pearson_correlation_matrix)).todense()
            pearson_correlation_matrix = convertInfsToZeros(csr_matrix(pearson_correlation_matrix)).todense()

            if args.pearsonMatrix:
                transf_matrix_pearson[bin_slice, bin_slice] = lil_matrix(pearson_correlation_matrix)

            # The eigen-decomposition runs on the Pearson matrix itself. The
            # covariance that used to be computed here was overwritten before
            # use, so the dead np.cov call was dropped.
            # NOTE(review): confirm that the Pearson matrix (and not its
            # covariance) is the intended input.
            corrmatrix = pearson_correlation_matrix
            corrmatrix = convertNansToZeros(csr_matrix(corrmatrix)).todense()
            corrmatrix = convertInfsToZeros(csr_matrix(corrmatrix)).todense()
            _, eigs = linalg.eig(corrmatrix)

            chrom, start, end, _ = zip(*ma.cut_intervals[bin_slice])

            chrom_list += chrom
            start_list += start
            end_list += end
            if is_bigwig_track:
                assert(len(end) == len(start))
                correlateEigenvectorWithHistonMarkTrack(eigs[:, :k].transpose(),
                                                        bwTrack, chrname, start,
                                                        end, args.extraTrack,
                                                        args.histonMarkType)

            vecs_list += eigs[:, :k].tolist()
    finally:
        # Release the track file handle even if a chromosome fails.
        if bwTrack is not None:
            bwTrack.close()

    if args.pearsonMatrix:
        _save_transformed_matrix(transf_matrix_pearson, ma, args.pearsonMatrix)

    if args.obsexpMatrix:
        _save_transformed_matrix(transf_matrix_obsexp, ma, args.obsexpMatrix)

    if args.extraTrack and not is_bigwig_track:
        vecs_list = correlateEigenvectorWithGeneTrack(ma, vecs_list, args.extraTrack)

    if args.format == 'bedgraph':
        for idx, outfile in enumerate(args.outputFileName):
            assert(len(vecs_list) == len(chrom_list))

            with open(outfile, 'w') as fh:
                for i, value in enumerate(vecs_list):
                    # rows with an unexpected number of entries are skipped
                    if len(value) == args.numberOfEigenvectors:
                        # builtin ``complex`` is what the removed ``np.complex``
                        # alias pointed to; only the real part is written
                        if isinstance(value[idx], complex):
                            value[idx] = value[idx].real
                        fh.write("{}\t{}\t{}\t{:.12f}\n".format(toString(chrom_list[i]), start_list[i], end_list[i], value[idx]))

    elif args.format == 'bigwig':
        if not pyBigWig.numpy == 1:
            log.error("ERROR: Your version of pyBigWig is not supporting "
                      "numpy: {}".format(pyBigWig.__file__))
            exit(1)
        # One header entry per chromosome: its name and the end of its last bin.
        old_chrom = chrom_list[0]
        header = []
        for i, _chrom in enumerate(chrom_list):
            if old_chrom != _chrom:
                header.append((toString(old_chrom), end_list[i - 1]))
            old_chrom = _chrom

        header.append((toString(chrom_list[-1]), end_list[-1]))
        for idx, outfile in enumerate(args.outputFileName):
            log.debug("bigwig: len(vecs_list) {}".format(len(vecs_list)))
            log.debug("bigwig: len(chrom_list) {}".format(len(chrom_list)))

            assert(len(vecs_list) == len(chrom_list))
            _chrom_list = []
            _start_list = []
            _end_list = []
            values = []

            bw = pyBigWig.open(outfile, 'w')
            # set big wig header
            bw.addHeader(header)
            # create entry lists
            for i, value in enumerate(vecs_list):
                # it can happen that some 'value' is having less dimensions than it should
                if len(value) == args.numberOfEigenvectors:
                    if isinstance(value[idx], complex):
                        value[idx] = value[idx].real
                    values.append(value[idx])
                    _chrom_list.append(toString(chrom_list[i]))
                    _start_list.append(start_list[i])
                    _end_list.append(end_list[i])

            # write entries
            bw.addEntries(_chrom_list, _start_list, ends=_end_list,
                          values=values)
            bw.close()
    else:
        log.error("Output format not known: {}".format(args.format))
        exit(1)
def open_and_store_matrix(pMatrixName, pMatricesList, pIndex, pXDimension,
                          pChromosomes, pNorm, pExtraTrack, pHistonMarkType,
                          pBinarization, pQueue):
    """Compute the first eigenvector of several matrices and queue the result.

    Every entry of ``pMatricesList`` is opened as
    ``pMatrixName + '::' + entry``. Per chromosome, an observed/expected
    matrix, its Pearson correlation matrix and the covariance of the latter
    are computed; the first column returned by ``linalg.eig`` on that
    covariance is collected. The concatenated real parts (NaN and Inf replaced
    by 0) are stored as one row of the result matrix.

    Args:
        pMatrixName: file name prefix of the matrices to open.
        pMatricesList: names appended to ``pMatrixName`` after '::'.
        pIndex: row offset; matrix ``i`` of the list is written to row
            ``pIndex + i`` of the result.
        pXDimension: number of rows of the result matrix.
        pChromosomes: if truthy, passed to ``keepOnlyTheseChr`` to restrict
            each matrix.
        pNorm: if truthy ``obs_exp_matrix_norm`` is used, otherwise
            ``obs_exp_matrix_lieberman``.
        pExtraTrack: optional track file name; if it ends in '.bw' or
            '.bigwig' it is opened with pyBigWig and handed to
            ``correlateEigenvectorWithHistonMarkTrack`` for every chromosome.
        pHistonMarkType: forwarded to
            ``correlateEigenvectorWithHistonMarkTrack``.
        pBinarization: if truthy, values <= 0 become -1 and values > 0
            become 1.
        pQueue: object with a ``put`` method; receives the result matrix
            (``None`` if ``pMatricesList`` is empty).
    """
    compartments_matrix = None
    # only the first eigenvector is used
    k = 1
    is_bigwig_track = bool(pExtraTrack) and \
        pExtraTrack.endswith(('.bw', '.bigwig'))

    for i, matrix in enumerate(pMatricesList):

        ma = hm.hiCMatrix(pMatrixName + '::' + matrix)

        # WARNING
        # DO NOT APPLY BIN MASKING, WILL LEAD TO DIFFERENT SIZES OF THE CHROMOSOMES
        # THIS IS CAUSING A FAIL OF THE COMPUTATION
        # ma.maskBins(ma.nan_bins)
        if pChromosomes:
            ma.keepOnlyTheseChr(pChromosomes)

        vecs_list = []
        chrom_list = []
        start_list = []
        end_list = []

        chromosome_names = ma.getChrNames()
        chromosome_count = len(chromosome_names)

        # Total number of bins over all retained chromosomes.
        length_chromosome = 0
        for chrname in chromosome_names:
            chr_range = ma.getChrBinRange(chrname)
            length_chromosome += chr_range[1] - chr_range[0]

        bwTrack = pyBigWig.open(pExtraTrack, 'r') if is_bigwig_track else None

        try:
            # PCA is computed per chromosome
            for chrname in chromosome_names:
                chr_range = ma.getChrBinRange(chrname)
                bin_slice = slice(chr_range[0], chr_range[1])
                submatrix = ma.matrix[bin_slice, bin_slice]
                if pNorm:
                    obs_exp_matrix_ = obs_exp_matrix_norm(submatrix)
                else:
                    obs_exp_matrix_ = obs_exp_matrix_lieberman(
                        submatrix, length_chromosome, chromosome_count)
                obs_exp_matrix_ = convertNansToZeros(
                    csr_matrix(obs_exp_matrix_)).todense()
                obs_exp_matrix_ = convertInfsToZeros(
                    csr_matrix(obs_exp_matrix_)).todense()

                pearson_correlation_matrix = np.corrcoef(obs_exp_matrix_)
                pearson_correlation_matrix = convertNansToZeros(
                    csr_matrix(pearson_correlation_matrix)).todense()
                pearson_correlation_matrix = convertInfsToZeros(
                    csr_matrix(pearson_correlation_matrix)).todense()

                corrmatrix = np.cov(pearson_correlation_matrix)
                corrmatrix = convertNansToZeros(csr_matrix(corrmatrix)).todense()
                corrmatrix = convertInfsToZeros(csr_matrix(corrmatrix)).todense()
                _, eigs = linalg.eig(corrmatrix)

                chrom, start, end, _ = zip(*ma.cut_intervals[bin_slice])

                chrom_list += chrom
                start_list += start
                end_list += end
                if is_bigwig_track:
                    assert (len(end) == len(start))
                    correlateEigenvectorWithHistonMarkTrack(
                        eigs[:, :k].transpose(), bwTrack, chrname, start, end,
                        pExtraTrack, pHistonMarkType)

                vecs_list += eigs[:, :k].tolist()
        finally:
            # The track is reopened for every matrix, so close it every time.
            if bwTrack is not None:
                bwTrack.close()

        eigenvector = np.real(np.array(vecs_list).flatten())

        if compartments_matrix is None:
            # Row width is fixed by the first matrix. Builtin ``float`` is
            # what the removed ``np.float`` alias pointed to.
            compartments_matrix = np.zeros([pXDimension, len(eigenvector)],
                                           dtype=float)

        # NaN and +/-Inf entries are replaced by 0.
        eigenvector[~np.isfinite(eigenvector)] = 0

        if pBinarization:
            eigenvector = np.where(eigenvector > 0, 1.0, -1.0)

        compartments_matrix[pIndex + i, :] = eigenvector

    pQueue.put(compartments_matrix)