Python toString 예제들, hicexplorer.utilities.toString Python 예제들

예제 #1

0

파일 보기

def main():
    """Report summary statistics for every matrix given on the command line.

    For each path in ``args.matrices`` the matrix is loaded and its size,
    sum (halved), bin length, chromosomes, non-zero count, minimum and
    maximum stored value and number of NaN bins are reported. For matrices
    accepted by ``check_cooler`` the value of
    ``getInformationCoolerBinNames()`` is reported as well.

    When ``args.outFileName`` is set the report goes to that file, otherwise
    it is printed. The file is truncated for the first matrix only and
    appended to afterwards, so reports of earlier matrices are no longer
    overwritten by later ones. For a single matrix the file content is
    unchanged.
    """
    args = parse_arguments().parse_args()
    header = ("# Matrix information file. Created with HiCExplorer's hicInfo "
              "version {}\n".format(__version__))
    # True when the previous report in the output file did not end with a
    # newline (the cooler column line is written without one).
    pending_newline = False

    for index, matrix in enumerate(args.matrices):
        hic_ma = hm.hiCMatrix(matrix)
        size = hic_ma.matrix.shape[0]
        num_non_zero = hic_ma.matrix.nnz
        sum_elements = hic_ma.matrix.sum() / 2
        bin_length = hic_ma.getBinSize()
        num_nan_bins = len(hic_ma.nan_bins)
        min_non_zero = hic_ma.matrix.data.min()
        max_non_zero = hic_ma.matrix.data.max()

        chromosomes = list(hic_ma.chrBinBoundaries)

        # The same lines are used for both the file and the console output.
        report = [
            "File:\t{}".format(matrix),
            "Size:\t{:,}".format(size),
            "Sum:\t{:,}".format(sum_elements),
            "Bin_length:\t{}".format(bin_length),
            "Chromosomes:\t{}".format(", ".join(toString(chromosomes))),
            "Non-zero elements:\t{:,}".format(num_non_zero),
            "Minimum (non zero):\t{}".format(min_non_zero),
            "Maximum:\t{}".format(max_non_zero),
            "NaN bins:\t{}".format(num_nan_bins),
        ]
        columns_line = None
        if check_cooler(matrix):
            columns_line = 'The following columns are available: {}'.format(
                hic_ma.getInformationCoolerBinNames())

        if not args.outFileName:
            for line in report:
                print(line)
            if columns_line is not None:
                print(columns_line)
            continue

        # Truncate once, then append, so every matrix ends up in the file.
        with open(args.outFileName, 'w' if index == 0 else 'a') as file:
            if pending_newline:
                file.write("\n")
            file.write(header)
            file.write("".join(line + "\n" for line in report))
            if columns_line is not None:
                file.write(columns_line)
        pending_newline = columns_line is not None

예제 #2

0

파일 보기

파일: hicPlotMatrix.py 프로젝트: ryys1122/HiCExplorer

def plotLongRangeContacts(pAxis, pNameOfLongRangeContactsFile, pHiCMatrix, pRegion, pChromosomeOrder):
    """Mark the contacts listed in a tab-separated file on ``pAxis``.

    Each usable line provides ``chrom, start, end`` for two loci in its
    first six columns; the two start positions become the x and y
    coordinate of one marker.

    :param pAxis: axis object offering ``set_xlim``, ``set_ylim`` and ``plot``
    :param pNameOfLongRangeContactsFile: path of the contacts file
    :param pHiCMatrix: not used by this function
    :param pRegion: ``None`` or a sequence ``(chromosome, start, end)``; when
        given, only contacts with both loci on that chromosome and both
        start positions inside ``[start, end]`` are drawn
    :param pChromosomeOrder: ``None`` or a container of chromosome names;
        when given, both loci must be on one of these chromosomes

    Lines with fewer than six columns or non-integer start positions are
    skipped. Previously a ``pRegion`` of ``None`` made every line fail inside
    a blanket ``except`` so nothing was ever drawn; now all contacts passing
    the chromosome filter are drawn in that case.
    """
    x_list = []
    y_list = []
    log.debug('pRegion {}'.format(pRegion))

    # Parse the region once instead of once per input line.
    if pRegion is not None:
        region_chrom = pRegion[0]
        region_start = int(pRegion[1])
        region_end = int(pRegion[2])

    with open(pNameOfLongRangeContactsFile, 'rb') as file:
        for line in file:
            fields = toString(line).strip().split('\t')
            try:
                chrom_X, start_X, _ = fields[0:3]
                chrom_Y, start_Y, _ = fields[3:6]
            except ValueError:
                # fewer than six columns (e.g. header or blank line)
                continue

            if pRegion is not None and (chrom_X != region_chrom or chrom_Y != region_chrom):
                continue
            if pChromosomeOrder is not None and (chrom_X not in pChromosomeOrder or chrom_Y not in pChromosomeOrder):
                continue

            try:
                x = int(start_X)
                y = int(start_Y)
            except ValueError:
                # start positions are not integers
                continue
            log.debug('x {} y {}'.format(x, y))

            if pRegion is None or (region_start <= x <= region_end and region_start <= y <= region_end):
                x_list.append(x)
                y_list.append(y)

    # Axis limits are only fixed for a region that does not start at 0 and
    # does not end at the 1e15 placeholder.
    if pRegion is not None and (region_start != 0 and region_end != 1e15):
        pAxis.set_xlim(region_start, region_end)
        pAxis.set_ylim(region_start, region_end)

    pAxis.plot(x_list, y_list, 's', lw=2,
               markerfacecolor='none', markeredgecolor='red')

예제 #3

0

파일 보기

파일: hicInfo.py 프로젝트: zhixuqiu/HiCExplorer

def main():
    """Print summary statistics for every matrix given on the command line.

    For each path in ``args.matrices`` the matrix is loaded and its size,
    sum (halved), bin length, chromosomes, non-zero count, minimum and
    maximum stored value and number of NaN bins are printed.
    """
    args = parse_arguments().parse_args()
    for matrix in args.matrices:

        hic_ma = hm.hiCMatrix(matrix)
        size = hic_ma.matrix.shape[0]
        num_non_zero = hic_ma.matrix.nnz
        sum_elements = hic_ma.matrix.sum() / 2
        bin_length = hic_ma.getBinSize()
        num_nan_bins = len(hic_ma.nan_bins)
        min_non_zero = hic_ma.matrix.data.min()
        max_non_zero = hic_ma.matrix.data.max()

        # Always bind the name: without this a path ending in "lieberman"
        # raised NameError for the first matrix, or silently reused the
        # chromosomes of the previous matrix.
        chromosomes = []
        if not matrix.endswith("lieberman"):
            # NOTE(review): the debug text says "lieberman matrix" although
            # this branch runs for paths that do NOT end in "lieberman";
            # confirm whether the message or the condition is intended.
            log.debug("lieberman matrix")
            chromosomes = list(hic_ma.chrBinBoundaries)

        print("File:\t{}".format(matrix))
        print("Size:\t{:,}".format(size))
        print("Sum:\t{:,}".format(sum_elements))
        print("Bin_length:\t{}".format(bin_length))
        # assumes toString returns an empty sequence for an empty list -
        # TODO confirm
        print("Chromosomes:\t{}".format(", ".join(toString(chromosomes))))
        print("Non-zero elements:\t{:,}".format(num_non_zero))
        print("Minimum (non zero):\t{}".format(min_non_zero))
        print("Maximum:\t{}".format(max_non_zero))
        print("NaN bins:\t{}".format(num_nan_bins))

예제 #4

0

파일 보기

파일: hicInfo.py 프로젝트: Rungetf/HiCExplorer

def main():
    """Print summary statistics for every matrix given on the command line.

    Each matrix in ``args.matrices`` is loaded and described by nine
    tab-separated ``label: value`` lines. For matrices accepted by
    ``check_cooler`` the method ``getInformationCoolerBinNames()`` is called
    afterwards; its return value is not used here.
    """
    options = parse_arguments().parse_args()
    for matrix in options.matrices:
        hic_ma = hm.hiCMatrix(matrix)

        n_bins = hic_ma.matrix.shape[0]
        n_stored = hic_ma.matrix.nnz
        half_total = hic_ma.matrix.sum() / 2
        resolution = hic_ma.getBinSize()
        n_nan = len(hic_ma.nan_bins)
        smallest = hic_ma.matrix.data.min()
        largest = hic_ma.matrix.data.max()
        chrom_names = list(hic_ma.chrBinBoundaries)

        # (template, value) pairs in output order
        rows = (
            ("File:\t{}", matrix),
            ("Size:\t{:,}", n_bins),
            ("Sum:\t{:,}", half_total),
            ("Bin_length:\t{}", resolution),
            ("Chromosomes:\t{}", ", ".join(toString(chrom_names))),
            ("Non-zero elements:\t{:,}", n_stored),
            ("Minimum (non zero):\t{}", smallest),
            ("Maximum:\t{}", largest),
            ("NaN bins:\t{}", n_nan),
        )
        for template, value in rows:
            print(template.format(value))

        if check_cooler(matrix):
            hic_ma.getInformationCoolerBinNames()

예제 #5

0

파일 보기

파일: readBed.py 프로젝트: wangyibin/HiCExplorer

    def get_no_comment_line(self):
        """
        Return the next line of the file that carries data.

        Lines starting with '#', "track" or "browser" and lines that are
        empty after stripping are skipped. ``self.line_number`` is advanced
        by the number of lines consumed, skipped ones included.

        The lines are skipped in a loop rather than by recursion, so a long
        run of comment lines can no longer exceed the recursion limit.

        :return: the line, converted with ``toString``
        :raises StopIteration: if the file handle is exhausted first; in that
            case ``self.line_number`` is left unchanged
        """
        consumed = 0
        while True:
            line = toString(next(self.file_handle))
            consumed += 1
            is_skippable = (line.startswith(("#", "track", "browser"))
                            or line.strip() == '')
            if not is_skippable:
                break

        self.line_number += consumed
        return line

예제 #6

0

파일 보기

def change_chrom_names(chrom):
    """
    Toggle the 'chr' prefix of a chromosome name.

    A name starting with 'chr' is returned without its first three
    characters; any other name is returned with 'chr' put in front. The
    input is passed through ``toString`` first.
    """
    # TODO: mapping from chromosome names like mithocondria is missing
    name = toString(chrom)
    has_prefix = name.startswith('chr')
    return name[3:] if has_prefix else 'chr' + name

예제 #7

0

파일 보기

파일: hicMergeTADbins.py 프로젝트: zhixuqiu/HiCExplorer

def get_boundary_bin_id(hic, bed_fh):
    """
    Collect the matrix bins hit by the regions listed in a BED file.

    For every data line the first three tab-separated columns are read as
    ``chrom, start, end`` and passed to ``hic.getRegionBinRange``; both
    returned bin ids are recorded. Lines starting with 'browser', 'track'
    or '#' and blank lines are skipped.

    A line that cannot be split into three columns, or whose start/end are
    not integers, is logged and terminates the program with exit status 1
    (previously the status was 0, which signals success).

    :param hic: HiCMatrix object
    :param bed_fh: file handle of the bed file
    :return: Sorted NumPy array of the unique bin indices.
    :raises AssertionError: if a region's start is larger than its end
    """
    boundaries = set()
    # line_number counts every line, skipped ones included
    for line_number, line in enumerate(bed_fh, start=1):
        line = toString(line)
        if line.startswith(('browser', 'track', '#')) or line.strip() == '':
            continue
        try:
            chrom, start, end = line.strip().split('\t')[0:3]
        except Exception as detail:
            msg = 'Could not read line\n{}\n. {}'.format(line, detail)
            log.exception(msg)
            sys.exit(1)

        try:
            start = int(start)
            end = int(end)
        except ValueError as detail:
            msg = "Error reading line: {}. One of the fields is not " \
                  "an integer.\nError message: {}".format(line_number, detail)
            log.exception(msg)
            sys.exit(1)

        # The message now describes the condition that actually failed.
        assert start <= end, "Error in line #{}, start larger than end in {}".format(
            line_number, line)

        # check the overlap of the region with the hic matrix bins
        start_bin, end_bin = hic.getRegionBinRange(chrom, start, end)
        boundaries.add(start_bin)
        boundaries.add(end_bin)

    return np.sort(list(boundaries))

예제 #8

0

파일 보기

파일: hicPlotMatrix.py 프로젝트: simonbray/HiCExplorer

def plotLongRangeContacts(pAxis, pNameOfLongRangeContactsFile, pHiCMatrix, pRegion):
    """Mark the contacts of one chromosome, read from a file, on ``pAxis``.

    The first six tab-separated columns of each line are read as
    ``chrom, start, end`` of two loci. A contact is drawn at
    ``(start of locus 1, start of locus 2)`` when both loci lie on
    ``pRegion[0]``. Lines that raise any exception while being parsed are
    ignored. The axis limits are set to ``pRegion[1]`` .. ``pRegion[2]``.
    ``pHiCMatrix`` is not used.
    """
    xs = []
    ys = []
    with open(pNameOfLongRangeContactsFile, 'rb') as handle:
        for raw in handle.readlines():
            columns = toString(raw).strip().split('\t')
            try:
                chrom_a, start_a, _end_a = columns[0:3]
                chrom_b, start_b, _end_b = columns[3:6]
                on_region = chrom_a == pRegion[0] and chrom_b == pRegion[0]
                if not on_region:
                    continue
                point = (int(start_a), int(start_b))
            except Exception:
                continue
            xs.append(point[0])
            ys.append(point[1])

    lower = int(pRegion[1])
    upper = int(pRegion[2])
    pAxis.set_xlim(lower, upper)
    pAxis.set_ylim(lower, upper)

    pAxis.plot(xs, ys, 's', lw=2, markerfacecolor='none', markeredgecolor='red')

예제 #9

0

파일 보기

파일: readBed.py 프로젝트: wangyibin/HiCExplorer

    def __init__(self, file_handle):
        """
        Set up the reader and pick the interval type from the first data line.

        The first non-comment line is split on tabs and handed to
        ``guess_file_type``; afterwards the handle is rewound to position 0.
        ``BedInterval`` becomes a namedtuple with 12 fields for 'bed12',
        9 fields for 'bed9' and 6 fields for any other file type.

        :param file_handle: file handle
        :return:
        """
        self.file_type = None
        self.file_handle = file_handle
        self.line_number = 0

        # guess file type
        first_line = toString(self.get_no_comment_line())
        self.guess_file_type(first_line.split('\t'))
        self.file_handle.seek(0)

        self.prev_chrom = None
        self.prev_start = -1
        self.prev_line = None

        # list of bed fields
        self.fields = [
            'chromosome', 'start', 'end', 'name', 'score', 'strand',
            'thick_start', 'thick_end', 'rgb', 'block_count', 'block_sizes',
            'block_starts'
        ]

        # number of leading fields used by each recognised file type
        widths = {'bed12': len(self.fields), 'bed9': 9}
        field_count = widths.get(self.file_type, 6)
        self.BedInterval = collections.namedtuple(
            'BedInterval', self.fields[:field_count])

예제 #10

0

파일 보기

def _region_x_coordinates(region_start, region_end, value_count):
    """Return ``value_count`` x positions starting at ``region_start``.

    Positions are spaced by the integer step that divides
    ``region_end * 2 - region_start`` into ``value_count`` parts; the result
    is padded or trimmed so its length equals ``value_count``.
    """
    # NOTE(review): the axis runs up to region_end * 2; the reason for the
    # factor is not evident from this code - confirm with the caller.
    step = int((region_end * 2 - region_start) // value_count)
    x = np.arange(region_start, region_end * 2, step)
    while len(x) < value_count:
        # np.append needs the array as first argument; the previous call
        # passed only the new value and raised TypeError.
        x = np.append(x, x[-1] + step)
    return x[:value_count]


def plotEigenvector(pAxis,
                    pNameOfEigenvectorsList,
                    pChromosomeList=None,
                    pRegion=None,
                    pXticks=None):
    """Draw an eigenvector track as a filled curve on ``pAxis``.

    :param pAxis: axis object to draw on; its frame and x axis are hidden
    :param pNameOfEigenvectorsList: list of file names; the extension of the
        first one ('bedgraph', 'bigwig' or 'bw') selects the reader and all
        files must share it, otherwise an error is logged and ``exit()`` is
        called
    :param pChromosomeList: optional list of chromosomes; when given, the
        values of all listed chromosomes are concatenated and plotted
        against their running index
    :param pRegion: optional ``(chrom, start, end)``; used when no
        chromosome list is given
    :param pXticks: not used by this function
    :return: ``None``; returns early, after logging, when a chromosome has no
        entry in the eigenvector file

    The value list is reset for every file, so only the values of the last
    file are drawn.
    """
    log.debug('plotting eigenvector')
    pAxis.set_frame_on(False)

    file_format = pNameOfEigenvectorsList[0].split(".")[-1]
    if file_format not in ('bedgraph', 'bigwig', 'bw'):

        log.error("Given eigenvector files are not bedgraph or bigwig")
        exit()

    for eigenvector in pNameOfEigenvectorsList:
        if eigenvector.split('.')[-1] != file_format:
            log.error("Eigenvector input files have different formats.")
            exit()

    if pRegion:
        chrom, region_start, region_end = pRegion
    x = None
    eigenvector = None
    if file_format == "bigwig" or file_format == 'bw':
        for eigenvectorFile in pNameOfEigenvectorsList:
            bw = pyBigWig.open(eigenvectorFile)
            eigenvector = []
            if pChromosomeList:
                for chrom in pChromosomeList:
                    try:
                        bins_list = bw.intervals(toString(chrom))
                    except Exception:
                        log.info(
                            "Chromosome with no entry in the eigenvector found. Please exclude it from the matrix: {}. The eigenvector is left empty."
                            .format(chrom))
                        return
                    if bins_list is None:
                        log.info(
                            "Chromosome with no entry in the eigenvector found. Please exclude it from the matrix: {}. The eigenvector is left empty."
                            .format(chrom))
                        return
                    for i, bin_ in enumerate(bins_list):
                        if i == 0:
                            region_start = bin_[0]
                        # keep only the real part of the stored value
                        eigenvector.append(complex(bin_[2]).real)
                    region_end = bins_list[-1][1]

                x = np.arange(0, len(eigenvector), 1)
                pAxis.set_xlim(0, len(eigenvector))

            elif pRegion:
                try:
                    # 0 .. 1e15 is treated as "whole chromosome": take the
                    # real extent from the file instead
                    if region_start == 0 and region_end == 1e15:
                        log.debug("chrom == pRegion")
                        bins_list = bw.intervals(toString(chrom))
                        region_start = bins_list[0][0]
                        region_end = bins_list[-1][1]
                    else:
                        log.debug(
                            "chrom: {}, region_start: {}, region_end: {}".
                            format(chrom, region_start, region_end))
                        log.debug("pRegion: {}".format(pRegion))
                        bins_list = bw.intervals(chrom, region_start,
                                                 region_end)
                except Exception:
                    log.info(
                        "Chromosome with no entry in the eigenvector found. Please exclude it from the matrix: {}. The eigenvector is left empty."
                        .format(chrom))
                    return
                if bins_list is None:
                    log.info(
                        "Chromosome with no entry in the eigenvector found. Please exclude it from the matrix: {}. The eigenvector is left empty."
                        .format(chrom))
                    return
                for bin_ in bins_list:
                    eigenvector.append(complex(bin_[2]).real)

                x = _region_x_coordinates(region_start, region_end,
                                          len(eigenvector))

                pAxis.set_xlim(region_start, region_end * 2)

    else:
        for eigenvectorFile in pNameOfEigenvectorsList:
            interval_tree, min_value, max_value = file_to_intervaltree(
                eigenvectorFile)
            eigenvector = []
            if pChromosomeList:
                for chrom in pChromosomeList:
                    if toString(chrom) not in interval_tree:
                        log.info(
                            "Chromosome with no entry in the eigenvector found. Please exclude it from the matrix: {}. The eigenvector is left empty."
                            .format(chrom))
                        return
                    for i, region in enumerate(
                            sorted(interval_tree[toString(chrom)])):
                        if i == 0:
                            region_start = region[0]
                        region_end = region[1]
                        eigenvector.append(complex(region.data[0]).real)
                x = np.arange(0, len(eigenvector), 1)
                pAxis.set_xlim(0, len(eigenvector))

            elif pRegion:
                if toString(chrom) not in interval_tree:
                    log.info(
                        "Chromosome with no entry in the eigenvector found. Please exclude it from the matrix: {}. The eigenvector is left empty."
                        .format(chrom))
                    return
                for region in sorted(interval_tree[toString(chrom)]
                                     [region_start:region_end]):
                    eigenvector.append(float(region.data[0]))

                x = _region_x_coordinates(region_start, region_end,
                                          len(eigenvector))

                pAxis.set_xlim(region_start, region_end * 2)
    if x is not None and eigenvector is not None:
        pAxis.fill_between(x, 0, eigenvector, edgecolor='none')
    pAxis.get_xaxis().set_visible(False)

예제 #11

0

파일 보기

파일: hicPlotViewpoint.py 프로젝트: tw7649116/HiCExplorer

def _normalize_coordinates(text):
    """Drop ',', ';' and '!' from ``text`` and turn every '-' into ':'."""
    for unwanted in (",", ";", "!"):
        text = text.replace(unwanted, "")
    return text.replace("-", ":")


def main(args=None):
    """Plot the interactions of a reference point for one or more matrices.

    For every matrix in ``args.matrix`` the values returned by
    ``getViewpointValues`` are drawn as one line; the figure is saved to
    ``args.outFileName``. When ``args.interactionOutFileName`` is given, the
    interactions of each matrix are also written to
    ``<interactionOutFileName>_<matrix basename>.bedgraph``.

    ``args.region`` must have the form ``chrom:start:end`` ('-' is accepted
    as separator) and ``args.referencePoint`` either ``chrom:position`` or
    ``chrom:start:end``. A malformed value is logged and ends the program
    with exit status 1 (an invalid region used to exit with status 0, and an
    invalid reference point ended in a NameError).

    :param args: optional argument list handed to the parser
    """
    args = parse_arguments().parse_args(args)
    log.warning('This tool is deprecated. Please use chicViewpoint, chicViewpointBackgroundModel and chicPlotViewpoint.')
    # NOTE(review): chrom, region_start and region_end stay unbound when
    # args.region is empty - presumably the parser makes it mandatory;
    # confirm against parse_arguments.
    if args.region:
        args.region = _normalize_coordinates(args.region)
        region = args.region.split(":")
        if len(region) != 3:
            log.error("Region format is invalid {}".format(args.region))
            exit(1)
        chrom, region_start, region_end = region[0], int(region[1]), int(region[2])

    args.referencePoint = _normalize_coordinates(args.referencePoint)
    referencePoint = args.referencePoint.split(":")

    data_list = []
    interactions_list = None
    if args.interactionOutFileName is not None:
        interactions_list = []
    matrix_name_legend = []
    for matrix in args.matrix:
        view_point_start, view_point_end, view_point_range, data_list_, interactions_list_ \
            = getViewpointValues(matrix, referencePoint, chrom, region_start, region_end, args.interactionOutFileName, args.chromosome)
        data_list.append(data_list_)
        if args.interactionOutFileName is not None:
            interactions_list.append(interactions_list_)
        matrix_name_legend.append(os.path.basename(matrix))

    fig = plt.figure(figsize=(6.4, 4.8))
    ax = plt.subplot(111)
    matrices_plot_legend = []
    for i, data in enumerate(data_list):
        matrices_plot_legend.append(ax.plot(range(len(data)), data, alpha=0.7, label=matrix_name_legend[i])[0])
    if len(referencePoint) == 2:
        log.debug("Single reference point mode: {}".format(referencePoint))
        log.debug("label 0: {}".format((int(referencePoint[1]) - region_start) * (-1)))
        log.debug("referencePoint[1]: {}".format(referencePoint[1]))
        log.debug("region_start: {}".format(region_start))
        log.debug("label 1: {}".format(referencePoint[0] + ":" + relabelTicks(int(referencePoint[1]))))
        log.debug("label 2: {}".format(region_end - int(referencePoint[1])))

        # ticks at: left edge, reference point, right edge
        ax.set_xticks([0, view_point_start - view_point_range[0], view_point_range[1] - view_point_range[0]])
        xticklabels = [
            relabelTicks((int(referencePoint[1]) - region_start) * (-1)),
            referencePoint[0] + ":" + relabelTicks(int(referencePoint[1])),
            relabelTicks(region_end - int(referencePoint[1])),
        ]

    elif len(referencePoint) == 3:
        log.debug("Range mode: {}".format(referencePoint))

        # fit scale: start coordinate is 0 --> view_point_range[0]
        ax.set_xticks([0, view_point_start - view_point_range[0], view_point_end - view_point_range[0], view_point_range[1] - view_point_range[0]])
        xticklabels = [
            relabelTicks((int(referencePoint[1]) - region_start) * (-1)),
            referencePoint[0] + ":" + relabelTicks(int(referencePoint[1])),
            referencePoint[0] + ":" + relabelTicks(int(referencePoint[2])),
            relabelTicks(region_end - int(referencePoint[1])),
        ]

    else:
        # Without this branch xticklabels would be unbound below.
        log.error("Reference point format is invalid {}".format(args.referencePoint))
        exit(1)

    ax.set_xticklabels(xticklabels)
    ax.set_ylabel('Number of interactions')

    plt.legend(handles=matrices_plot_legend)
    plt.savefig(args.outFileName, dpi=args.dpi)
    plt.close(fig)

    if interactions_list is not None:
        for i, interactions_list_ in enumerate(interactions_list):
            with open(args.interactionOutFileName + '_' + matrix_name_legend[i] + '.bedgraph', 'w') as fh:
                for interaction in interactions_list_:
                    # six text columns followed by one float column
                    columns = [toString(column) for column in interaction[:6]]
                    fh.write("{}\t{}\t{}\t{}\t{}\t{}\t{:.12f}\n".format(*columns, float(interaction[6])))

예제 #12

0

파일 보기

def main(args=None):
    """Compute eigenvectors per chromosome and write them to track files.

    For every chromosome of ``args.matrix`` (optionally restricted to
    ``args.chromosomes``) the sub-matrix is turned into an
    observed/expected matrix (``exp_obs_matrix_norm`` when ``args.norm`` is
    set, ``exp_obs_matrix_lieberman`` otherwise), then into a Pearson
    correlation matrix and its covariance matrix, whose eigenvectors are
    computed. NaN and infinite values are replaced by zero after each step.

    Optional outputs: the Pearson matrix (``args.pearsonMatrix``) and the
    observed/expected matrix (``args.obsexpMatrix``), saved as 'h5' when the
    file name ends in '.h5' and as 'cool' otherwise. One output file per
    eigenvector is written in ``args.format`` ('bedgraph' or 'bigwig').

    The program ends with exit status 1 when the number of output files does
    not match the number of eigenvectors, when the output format is unknown,
    or when pyBigWig lacks numpy support for bigwig output.

    :param args: optional argument list handed to the parser
    """
    args = parse_arguments().parse_args(args)
    if int(args.numberOfEigenvectors) != len(args.outputFileName):
        # The two literals are concatenated; the trailing space keeps
        # "Please" and "provide" apart.
        log.error(
            "Number of output file names and number of eigenvectors does not match. Please "
            "provide the name of each file.\nFiles: {}\nNumber of eigenvectors: {}"
            .format(args.outputFileName, args.numberOfEigenvectors))
        exit(1)

    ma = hm.hiCMatrix(args.matrix)
    ma.maskBins(ma.nan_bins)

    if args.chromosomes:
        ma.keepOnlyTheseChr(args.chromosomes)

    vecs_list = []
    chrom_list = []
    start_list = []
    end_list = []
    # PCA is computed per chromosome
    length_chromosome = 0
    chromosome_count = len(ma.getChrNames())
    if args.pearsonMatrix:
        trasf_matrix_pearson = lil_matrix(ma.matrix.shape)

    if args.obsexpMatrix:
        trasf_matrix_obsexp = lil_matrix(ma.matrix.shape)

    # total number of bins over all chromosomes
    for chrname in ma.getChrNames():
        chr_range = ma.getChrBinRange(chrname)
        length_chromosome += chr_range[1] - chr_range[0]
    for chrname in ma.getChrNames():
        chr_range = ma.getChrBinRange(chrname)

        submatrix = ma.matrix[chr_range[0]:chr_range[1],
                              chr_range[0]:chr_range[1]]
        if args.norm:
            exp_obs_matrix_ = exp_obs_matrix_norm(submatrix, length_chromosome,
                                                  chromosome_count)
        else:
            exp_obs_matrix_ = exp_obs_matrix_lieberman(submatrix,
                                                       length_chromosome,
                                                       chromosome_count)
        # the clean-up is the same for both normalisations
        exp_obs_matrix_ = convertNansToZeros(
            csr_matrix(exp_obs_matrix_)).todense()
        exp_obs_matrix_ = convertInfsToZeros(
            csr_matrix(exp_obs_matrix_)).todense()

        if args.obsexpMatrix:
            trasf_matrix_obsexp[chr_range[0]:chr_range[1],
                                chr_range[0]:chr_range[1]] = lil_matrix(
                                    exp_obs_matrix_)

        pearson_correlation_matrix = np.corrcoef(exp_obs_matrix_)
        pearson_correlation_matrix = convertNansToZeros(
            csr_matrix(pearson_correlation_matrix)).todense()
        pearson_correlation_matrix = convertInfsToZeros(
            csr_matrix(pearson_correlation_matrix)).todense()

        if args.pearsonMatrix:
            trasf_matrix_pearson[chr_range[0]:chr_range[1],
                                 chr_range[0]:chr_range[1]] = lil_matrix(
                                     pearson_correlation_matrix)

        corrmatrix = np.cov(pearson_correlation_matrix)
        corrmatrix = convertNansToZeros(csr_matrix(corrmatrix)).todense()
        corrmatrix = convertInfsToZeros(csr_matrix(corrmatrix)).todense()
        evals, eigs = linalg.eig(corrmatrix)
        k = args.numberOfEigenvectors

        chrom, start, end, _ = zip(
            *ma.cut_intervals[chr_range[0]:chr_range[1]])
        # one row per bin, holding the first k eigenvector components
        vecs_list += eigs[:, :k].tolist()

        chrom_list += chrom
        start_list += start
        end_list += end

    if args.pearsonMatrix:
        file_type = 'cool'
        if args.pearsonMatrix.endswith('.h5'):
            file_type = 'h5'
        matrixFileHandlerOutput = MatrixFileHandler(pFileType=file_type)
        matrixFileHandlerOutput.set_matrix_variables(
            trasf_matrix_pearson.tocsr(), ma.cut_intervals, ma.nan_bins,
            ma.correction_factors, ma.distance_counts)
        matrixFileHandlerOutput.save(args.pearsonMatrix,
                                     pSymmetric=True,
                                     pApplyCorrection=False)

    if args.obsexpMatrix:
        file_type = 'cool'
        if args.obsexpMatrix.endswith('.h5'):
            file_type = 'h5'
        matrixFileHandlerOutput = MatrixFileHandler(pFileType=file_type)
        matrixFileHandlerOutput.set_matrix_variables(
            trasf_matrix_obsexp.tocsr(), ma.cut_intervals, ma.nan_bins,
            ma.correction_factors, ma.distance_counts)
        matrixFileHandlerOutput.save(args.obsexpMatrix,
                                     pSymmetric=True,
                                     pApplyCorrection=False)

    if args.geneTrack:
        vecs_list = correlateEigenvectorWithGeneTrack(ma, vecs_list,
                                                      args.geneTrack)

    if args.format == 'bedgraph':
        for idx, outfile in enumerate(args.outputFileName):
            assert (len(vecs_list) == len(chrom_list))

            with open(outfile, 'w') as fh:
                for i, value in enumerate(vecs_list):
                    if len(value) == args.numberOfEigenvectors:
                        # np.complex was an alias of the builtin and has been
                        # removed from NumPy; the builtin check also matches
                        # NumPy complex128 scalars.
                        if isinstance(value[idx], complex):
                            value[idx] = value[idx].real
                        fh.write("{}\t{}\t{}\t{:.12f}\n".format(
                            toString(chrom_list[i]), start_list[i],
                            end_list[i], value[idx]))

    elif args.format == 'bigwig':
        if not pyBigWig.numpy == 1:
            log.error(
                "ERROR: Your version of pyBigWig is not supporting numpy: {}".
                format(pyBigWig.__file__))
            exit(1)
        # header: one (chromosome, last end position) entry per chromosome,
        # in the order the chromosomes appear in chrom_list
        old_chrom = chrom_list[0]
        header = []
        for i, _chrom in enumerate(chrom_list):
            if old_chrom != _chrom:
                header.append((toString(old_chrom), end_list[i - 1]))
            old_chrom = _chrom

        header.append((toString(chrom_list[-1]), end_list[-1]))
        for idx, outfile in enumerate(args.outputFileName):
            log.debug("bigwig: len(vecs_list) {}".format(len(vecs_list)))
            log.debug("bigwig: len(chrom_list) {}".format(len(chrom_list)))

            assert (len(vecs_list) == len(chrom_list))
            _chrom_list = []
            _start_list = []
            _end_list = []
            values = []

            bw = pyBigWig.open(outfile, 'w')
            # set big wig header
            bw.addHeader(header)
            # create entry lists
            for i, value in enumerate(vecs_list):
                # it can happen that some 'value' is having less dimensions than it should
                if len(value) == args.numberOfEigenvectors:
                    if isinstance(value[idx], complex):
                        value[idx] = value[idx].real
                    values.append(value[idx])
                    _chrom_list.append(toString(chrom_list[i]))
                    _start_list.append(start_list[i])
                    _end_list.append(end_list[i])

            # write entries
            bw.addEntries(_chrom_list,
                          _start_list,
                          ends=_end_list,
                          values=values)
            bw.close()
    else:
        log.error("Output format not known: {}".format(args.format))
        exit(1)

예제 #13

0

파일 보기

파일: hicCorrectMatrix.py 프로젝트: wangyibin/HiCExplorer

def main(args=None):
    """Correct a Hi-C matrix with ICE or KR, or produce the diagnostic plot.

    The command line is parsed with ``parse_arguments()``. When the parsed
    namespace contains ``plotName`` only the diagnostic plot is produced via
    ``plot_total_contact_dist`` and the function returns. Otherwise the matrix
    is filtered (ICE only), balanced either as a whole or per chromosome
    (``args.perchr``), the correction factors are stored on the matrix object
    and the result is saved to ``args.outFileName``.

    :param args: optional list of command line arguments; ``None`` lets
                 argparse read ``sys.argv``.
    """
    args = parse_arguments().parse_args(args)
    if args.verbose:
        log.setLevel(logging.INFO)

    # For cooler files with exactly one requested chromosome, only that
    # chromosome is loaded; otherwise load everything and reorder afterwards.
    if check_cooler(args.matrix) and args.chromosomes is not None and len(
            args.chromosomes) == 1:
        ma = hm.hiCMatrix(args.matrix, pChrnameList=toString(args.chromosomes))
    else:
        ma = hm.hiCMatrix(args.matrix)

        if args.chromosomes:
            ma.reorderChromosomes(toString(args.chromosomes))

    # mask all zero value bins
    if 'correctionMethod' in args:
        if args.correctionMethod == 'ICE':
            row_sum = np.asarray(ma.matrix.sum(axis=1)).flatten()
            log.info("Removing {} zero value bins".format(sum(row_sum == 0)))
            ma.maskBins(np.flatnonzero(row_sum == 0))
            matrix_shape = ma.matrix.shape
    if 'plotName' in args:
        row_sum = np.asarray(ma.matrix.sum(axis=1)).flatten()
        log.info("Removing {} zero value bins".format(sum(row_sum == 0)))
        ma.maskBins(np.flatnonzero(row_sum == 0))
        matrix_shape = ma.matrix.shape

    ma.matrix = convertNansToZeros(ma.matrix)
    ma.matrix = convertInfsToZeros(ma.matrix)
    ma.matrix = ma.matrix.astype(np.float64, copy=True)

    log.debug('ma.matrix.indices {}'.format(ma.matrix.indices.dtype))
    log.debug('ma.matrix.data {}'.format(ma.matrix.data.dtype))
    log.debug('ma.matrix.indptr {}'.format(ma.matrix.indptr.dtype))

    if 'plotName' in args:
        plot_total_contact_dist(ma, args)
        log.info("Saving diagnostic plot {}\n".format(args.plotName))
        return

    log.info("matrix contains {} data points. Sparsity {:.3f}.".format(
        len(ma.matrix.data),
        float(len(ma.matrix.data)) / (ma.matrix.shape[0]**2)))

    if args.skipDiagonal:
        ma.diagflat(value=0)

    total_filtered_out = set()
    if args.correctionMethod == 'ICE':
        if not args.filterThreshold:
            log.error('min and max filtering thresholds should be set')
            sys.exit(1)
        outlier_regions = filter_by_zscore(ma,
                                           args.filterThreshold[0],
                                           args.filterThreshold[1],
                                           perchr=args.perchr)
        # compute and print some statistics
        pct_outlier = 100 * float(len(outlier_regions)) / ma.matrix.shape[0]
        ma.printchrtoremove(outlier_regions,
                            label="Bins that are MAD outliers ({:.2f}%) "
                            "out of".format(pct_outlier, ma.matrix.shape[0]),
                            restore_masked_bins=False)

        # printchrtoremove was asked not to restore bins, so the shape must
        # still be the one recorded after the zero-row masking above.
        assert matrix_shape == ma.matrix.shape
        # mask filtered regions
        ma.maskBins(outlier_regions)
        total_filtered_out = set(outlier_regions)

        if args.sequencedCountCutoff and 0 < args.sequencedCountCutoff < 1:
            chrom, _, _, coverage = zip(*ma.cut_intervals)

            assert type(coverage[0]) == np.float64

            failed_bins = np.flatnonzero(
                np.array(coverage) < args.sequencedCountCutoff)

            ma.printchrtoremove(failed_bins,
                                label="Bins with low coverage",
                                restore_masked_bins=False)
            ma.maskBins(failed_bins)
            # Accumulate instead of overwriting, otherwise the outlier bins
            # recorded above are dropped from the final summary.
            total_filtered_out = total_filtered_out.union(failed_bins)

        if args.transCutoff and 0 < args.transCutoff < 100:
            cutoff = float(args.transCutoff) / 100
            # a usual cutoff is 0.05
            ma.truncTrans(high=cutoff)

        # Row sums before correction. They are needed by the inflation check
        # further down whether or not a trans cutoff was applied, so they
        # must not be computed only inside the transCutoff branch.
        pre_row_sum = np.asarray(ma.matrix.sum(axis=1)).flatten()

    correction_factors = []
    corrected_matrix = lil_matrix(ma.matrix.shape)
    if args.perchr:
        # normalize each chromosome independently
        for chrname in list(ma.interval_trees):
            chr_range = ma.getChrBinRange(chrname)
            chr_submatrix = ma.matrix[chr_range[0]:chr_range[1],
                                      chr_range[0]:chr_range[1]]
            if args.correctionMethod == 'ICE':
                _matrix, _corr_factors = iterative_correction(
                    chr_submatrix, args)
                corrected_matrix[chr_range[0]:chr_range[1],
                                 chr_range[0]:chr_range[1]] = _matrix
                correction_factors.append(_corr_factors)
            else:
                # Set the kr matrix along with its correction factors vector
                assert (args.correctionMethod == 'KR')
                log.debug("Loading a float sparse matrix for KR balancing")
                kr = kr_balancing(
                    chr_submatrix.shape[0], chr_submatrix.shape[1],
                    chr_submatrix.count_nonzero(),
                    chr_submatrix.indptr.astype(np.int64, copy=False),
                    chr_submatrix.indices.astype(np.int64, copy=False),
                    chr_submatrix.data.astype(np.float64, copy=False))
                kr.computeKR()
                # The normalised matrix is only materialised for .h5 output.
                if args.outFileName.endswith('.h5'):
                    corrected_matrix[
                        chr_range[0]:chr_range[1],
                        chr_range[0]:chr_range[1]] = kr.get_normalised_matrix(
                            True)
                correction_factors.append(
                    kr.get_normalisation_vector(False).todense())

        correction_factors = np.concatenate(correction_factors)

    else:
        if args.correctionMethod == 'ICE':
            corrected_matrix, correction_factors = iterative_correction(
                ma.matrix, args)
            ma.setMatrixValues(corrected_matrix)
        else:
            assert (args.correctionMethod == 'KR')
            log.debug("Loading a float sparse matrix for KR balancing")
            kr = kr_balancing(ma.matrix.shape[0], ma.matrix.shape[1],
                              ma.matrix.count_nonzero(),
                              ma.matrix.indptr.astype(np.int64, copy=False),
                              ma.matrix.indices.astype(np.int64, copy=False),
                              ma.matrix.data.astype(np.float64, copy=False))
            log.debug('passed pointers')
            kr.computeKR()
            log.debug('computation done')

            # set it to False since the vector is already normalised
            # with the previous True
            correction_factors = kr.get_normalisation_vector(False).todense()

            if args.outFileName.endswith('.h5'):
                corrected_matrix = kr.get_normalised_matrix(True)

    if args.outFileName.endswith('.h5'):
        ma.setMatrixValues(corrected_matrix)
    ma.setCorrectionFactors(correction_factors)

    log.debug("Correction factors {}".format(correction_factors[:10]))
    if args.inflationCutoff and args.inflationCutoff > 0 and args.correctionMethod == 'ICE':

        after_row_sum = np.asarray(corrected_matrix.sum(axis=1)).flatten()
        # identify rows that were expanded more than args.inflationCutoff times
        to_remove = np.flatnonzero(
            after_row_sum / pre_row_sum >= args.inflationCutoff)
        ma.printchrtoremove(to_remove,
                            label="inflated >={} "
                            "regions".format(args.inflationCutoff),
                            restore_masked_bins=False)
        total_filtered_out = total_filtered_out.union(to_remove)
        ma.maskBins(to_remove)
    ma.printchrtoremove(sorted(list(total_filtered_out)),
                        label="Total regions to be removed",
                        restore_masked_bins=False)

    ma.save(args.outFileName, pApplyCorrection=False)

예제 #14

0

파일 보기

def main(args=None):
    """Plot a Hi-C matrix as a heatmap and save it to ``args.outFileName``.

    The matrix is loaded either through the cooler-specific path (only the
    requested region / chromosomes are read) or completely, optionally
    reordered by ``args.chromosomeOrder`` and restricted to ``args.region``.
    Depending on ``args.perChromosome`` the figure is built by ``plotPerChr``
    or by a single ``plotHeatmap`` call.

    :param args: optional list of command line arguments; ``None`` lets
                 argparse read ``sys.argv``.
    """
    args = parse_arguments().parse_args(args)
    if args.title:
        args.title = remove_non_ascii(args.title)

    chrom = None
    start_pos1 = None
    chrom2 = None
    start_pos2 = None

    if args.perChromosome and args.region:
        log.error('ERROR, choose from the option '
                  '--perChromosome or --region, the two '
                  'options at the same time are not '
                  'compatible.')
        exit(1)

    is_cooler = check_cooler(args.matrix)
    log.debug("Cooler or no cooler: {}".format(is_cooler))
    # The cooler fast path is only taken when at most one chromosome is
    # requested via --chromosomeOrder.
    open_cooler_chromosome_order = True
    if args.chromosomeOrder is not None and len(args.chromosomeOrder) > 1:
        open_cooler_chromosome_order = False

    if is_cooler and not args.region2 and open_cooler_chromosome_order:
        log.debug("Retrieve data from cooler format and use its benefits.")
        regionsToRetrieve = None
        if args.region:
            regionsToRetrieve = []
            regionsToRetrieve.append(args.region)
        if args.chromosomeOrder:
            args.region = None
            args.region2 = None
            regionsToRetrieve = args.chromosomeOrder

        ma = HiCMatrix.hiCMatrix(args.matrix, pChrnameList=regionsToRetrieve)
        log.debug('Shape {}'.format(ma.matrix.shape))
        if args.clearMaskedBins:
            ma.maskBins(ma.nan_bins)
            # to avoid gaps in the plot, bins flanking the masked bins
            # are enlarged
            new_intervals = enlarge_bins(ma.cut_intervals)
            ma.setCutIntervals(new_intervals)

        if args.region:
            chrom, region_start, region_end, idx1, start_pos1, chrom2, region_start2, region_end2, idx2, start_pos2 = getRegion(args, ma)

        matrix = np.asarray(ma.matrix.todense().astype(float))
        matrix_length = len(matrix[0])
        log.debug("Number of data points matrix_cool: {}".format(matrix_length))
    else:
        ma = HiCMatrix.hiCMatrix(args.matrix)
        if args.clearMaskedBins:
            ma.maskBins(ma.nan_bins)
            new_intervals = enlarge_bins(ma.cut_intervals)
            ma.setCutIntervals(new_intervals)
        if args.chromosomeOrder:
            args.region = None
            args.region2 = None

            valid_chromosomes = []
            invalid_chromosomes = []
            log.debug('args.chromosomeOrder: {}'.format(args.chromosomeOrder))
            log.debug("ma.chrBinBoundaries {}".format(ma.chrBinBoundaries))
            if sys.version_info[0] == 3:
                args.chromosomeOrder = toBytes(args.chromosomeOrder)
            # Use a dedicated loop variable: reusing ``chrom`` here would leak
            # the last requested chromosome into the x-axis label passed to
            # plotHeatmap below.
            for chrom_name in toString(args.chromosomeOrder):
                if chrom_name in ma.chrBinBoundaries:
                    valid_chromosomes.append(chrom_name)
                else:
                    invalid_chromosomes.append(chrom_name)

            if len(invalid_chromosomes) > 0:
                log.warning("WARNING: The following chromosome/scaffold names were not found. Please check "
                            "the correct spelling of the chromosome names. \n")
                log.warning("\n".join(invalid_chromosomes))
            ma.reorderChromosomes(valid_chromosomes)

        log.info("min: {}, max: {}\n".format(ma.matrix.data.min(), ma.matrix.data.max()))

        if args.region:
            chrom, region_start, region_end, idx1, start_pos1, chrom2, region_start2, region_end2, idx2, start_pos2 = getRegion(args, ma)

            matrix = np.asarray(ma.matrix[idx1, :][:, idx2].todense().astype(float))

        else:
            log.debug("Else branch")
            matrix = np.asarray(ma.getMatrix().astype(float))

    matrix_length = len(matrix[0])
    log.debug("Number of data points matrix: {}".format(matrix_length))

    # Sanity check only: a mismatch is logged, not raised.
    for matrix_ in matrix:
        if not matrix_length == len(matrix_):
            log.error("Matrices do not have the same length: {} , {}".format(matrix_length, len(matrix_)))

    cmap = cm.get_cmap(args.colorMap)
    log.debug("Nan values set to black\n")
    cmap.set_bad('black')

    bigwig_info = None
    if args.bigwig:
        bigwig_info = {'args': args, 'axis': None, 'axis_colorbar': None, 'nan_bins': ma.nan_bins}

    if args.perChromosome:
        fig = plotPerChr(ma, cmap, args, pBigwig=bigwig_info)

    else:
        norm = None

        if args.log or args.log1p:
            # Zeros cannot be shown on a log scale; replace them (and any
            # nan/inf) with the smallest remaining value.
            mask = matrix == 0
            matrix[mask] = np.nanmin(matrix[~mask])

            if np.isnan(matrix).any() or np.isinf(matrix).any():
                log.debug("any nan {}".format(np.isnan(matrix).any()))
                log.debug("any inf {}".format(np.isinf(matrix).any()))
                mask_nan = np.isnan(matrix)
                mask_inf = np.isinf(matrix)
                matrix[mask_nan] = np.nanmin(matrix[~mask_nan])
                matrix[mask_inf] = np.nanmin(matrix[~mask_inf])

        log.debug("any nan after remove of nan: {}".format(np.isnan(matrix).any()))
        log.debug("any inf after remove of inf: {}".format(np.isinf(matrix).any()))
        if args.log1p:
            matrix += 1
            norm = LogNorm()
        elif args.log:
            norm = LogNorm()

        if args.bigwig:
            # increase figure height to accommodate bigwig track
            fig_height = 8.5
        else:
            fig_height = 7
        height = 4.8 / fig_height

        fig_width = 8
        width = 5.0 / fig_width
        left_margin = (1.0 - width) * 0.5

        fig = plt.figure(figsize=(fig_width, fig_height), dpi=args.dpi)

        if args.bigwig:
            gs = gridspec.GridSpec(2, 2, height_ratios=[0.90, 0.1], width_ratios=[0.97, 0.03])
            gs.update(hspace=0.05, wspace=0.05)
            ax1 = plt.subplot(gs[0, 0])
            ax2 = plt.subplot(gs[1, 0])
            ax3 = plt.subplot(gs[0, 1])
            bigwig_info['axis'] = ax2
            bigwig_info['axis_colorbar'] = ax3
        else:
            ax1 = None
        bottom = 1.3 / fig_height

        if start_pos1 is None:
            start_pos1 = make_start_pos_array(ma)

        position = [left_margin, bottom, width, height]
        plotHeatmap(matrix, ma.get_chromosome_sizes(), fig, position,
                    args, cmap, xlabel=chrom, ylabel=chrom2,
                    start_pos=start_pos1, start_pos2=start_pos2, pNorm=norm, pAxis=ax1, pBigwig=bigwig_info)

    if not args.disable_tight_layout:
        if args.perChromosome or args.bigwig:
            try:
                plt.tight_layout()
            except (UserWarning, ValueError):
                log.info("Failed to tight layout. Using regular plot.")

    plt.savefig(args.outFileName, dpi=args.dpi)
    plt.close(fig)

예제 #15

0

파일 보기

def plotPerChr(hic_matrix, cmap, args, pBigwig):
    """
    Draw one heatmap per chromosome on a grid with at most five
    chromosomes per row and return the resulting figure.

    When ``pBigwig`` is truthy every grid cell is subdivided so that a
    track axis and a colour bar axis sit next to the heatmap axis.
    ``args.region`` is overwritten with each chromosome name in turn.
    """
    from math import ceil
    per_row = 5
    chrom_names = hic_matrix.getChrNames()
    n_chroms = len(chrom_names)
    n_rows = int(ceil(float(n_chroms) / per_row))
    n_cols = min(per_row, n_chroms)
    # one extra, narrow column is reserved at the right of the grid
    col_widths = [1.0] * n_cols + [0.05]
    layout = gridspec.GridSpec(n_rows, n_cols + 1,
                               width_ratios=col_widths,
                               height_ratios=[1] * n_rows)

    figure_size = (sum((np.array(col_widths) + 0.05) * 6), 6 * n_rows)
    fig = plt.figure(figsize=figure_size, dpi=args.dpi)

    # The values are not used below; the unpacking is kept because it fails
    # early when the intervals cannot be split into four columns.
    _chroms, _starts, _ends, _extra = zip(*hic_matrix.cut_intervals)

    for position, chrname in enumerate(chrom_names):
        log.debug('chrom: {}'.format(chrname))

        grid_row, grid_col = divmod(position, per_row)
        cell = layout[grid_row, grid_col]
        if pBigwig:
            sub_layout = gridspec.GridSpecFromSubplotSpec(2, 2, height_ratios=[0.85, 0.15], width_ratios=[0.93, 0.07],
                                                          subplot_spec=cell, wspace=0.1, hspace=0.1)
            axis = plt.subplot(sub_layout[0, 0])
            axis_eigenvector = plt.subplot(sub_layout[1, 0])
            axis_scale = plt.subplot(sub_layout[0, 1])
        else:
            axis = plt.subplot(cell)
            axis.set_title(toString(chrname))

        first_bin, last_bin = hic_matrix.getChrBinRange(chrname)[0], hic_matrix.getChrBinRange(chrname)[1]
        sub_matrix = hic_matrix.matrix[first_bin:last_bin, first_bin:last_bin]
        matrix = np.asarray(sub_matrix.todense().astype(float))

        if args.log or args.log1p:
            zero_mask = matrix == 0
            nan_mask = np.isnan(matrix)
            inf_mask = np.isinf(matrix)
            log.debug("any nan {}".format(np.isnan(matrix).any()))
            log.debug("any inf {}".format(np.isinf(matrix).any()))

            # Replace zeros, nans and infs by the smallest remaining value;
            # a failure is only logged and the matrix is left as it is.
            try:
                matrix[zero_mask] = np.nanmin(matrix[~zero_mask])
                matrix[nan_mask] = np.nanmin(matrix[~nan_mask])
                matrix[inf_mask] = np.nanmin(matrix[~inf_mask])
            except Exception:
                log.debug("Clearing of matrix failed.")
            log.debug("any nanafter remove of nan: {}".format(np.isnan(matrix).any()))
            log.debug("any inf after remove of inf: {}".format(np.isinf(matrix).any()))

        if args.log1p:
            matrix += 1
        norm = LogNorm() if (args.log1p or args.log) else None

        bigwig_info = None
        if pBigwig:
            bigwig_info = {'args': args,
                           'axis': axis_eigenvector,
                           'axis_colorbar': axis_scale,
                           'nan_bins': hic_matrix.nan_bins}

        chr_bin_boundary = OrderedDict()
        chr_bin_boundary[chrname] = hic_matrix.get_chromosome_sizes()[chrname]

        args.region = toString(chrname)
        # only the two start position arrays of the ten returned values are needed
        (_, _, _, _, start_pos1,
         _, _, _, _, start_pos2) = getRegion(args, hic_matrix)
        plotHeatmap(matrix, chr_bin_boundary, fig, None,
                    args, cmap, xlabel=chrname, ylabel=chrname,
                    start_pos=start_pos1, start_pos2=start_pos2, pNorm=norm, pAxis=axis, pBigwig=bigwig_info)
    return fig

예제 #16

0

파일 보기

def plotHeatmap(ma, chrBinBoundaries, fig, position, args, cmap, xlabel=None,
                ylabel=None, start_pos=None, start_pos2=None, pNorm=None, pAxis=None, pBigwig=None):
    """
    Draw ``ma`` as a pcolormesh heatmap with colour bar, tick labels and,
    when ``pBigwig`` is given, an additional bigwig track.

    The heatmap is drawn on ``pAxis`` if supplied, otherwise on a new axis
    created in ``fig`` at ``position``. Matrices with fewer than five rows
    are not plotted; the function logs a message and returns ``None``.
    """
    log.debug("plotting heatmap")
    if ma.shape[0] < 5:
        # This happens when a tiny matrix wants to be plotted, or by using per chromosome and
        # a small chromosome (eg. contig) is present.
        # Otherwise, pcolormesh will throw an error if the matrix size is 1.
        chr_names = " ".join(toString(name) for name in chrBinBoundaries.keys())
        log.info("Matrix for {} too small to plot. Matrix size: {}".format(chr_names, ma.shape))
        return

    heat_axis = pAxis if pAxis is not None else fig.add_axes(position)

    if args.title:
        heat_axis.set_title(toString(args.title))

    # without a second position array the plot is symmetric
    if start_pos2 is None:
        start_pos2 = start_pos

    grid_x, grid_y = np.meshgrid(start_pos, start_pos2)

    mesh = heat_axis.pcolormesh(grid_x.T, grid_y.T, ma, vmin=args.vMin, vmax=args.vMax, cmap=cmap, norm=pNorm)
    heat_axis.invert_yaxis()
    mesh.set_rasterized(True)

    if args.region:
        x_labels = relabel_ticks(heat_axis.get_xticks())
        heat_axis.get_xaxis().set_tick_params(which='both', bottom='on', direction='out')
        heat_axis.set_xticklabels(x_labels, size='small', rotation=45)

        y_labels = relabel_ticks(heat_axis.get_yticks())
        heat_axis.get_yaxis().set_tick_params(which='both', bottom='on', direction='out')
        heat_axis.set_yticklabels(y_labels, size='small')
        xticks = [x_labels]
    else:
        # one tick at the running offset where each chromosome begins
        tick_positions = []
        offset = 0
        for size in chrBinBoundaries.values():
            tick_positions.append(offset)
            offset += size
        chrom_labels = list(chrBinBoundaries)
        heat_axis.set_xticks(tick_positions)
        heat_axis.set_yticks(tick_positions)
        chrom_labels = toString(chrom_labels)
        xticks = [chrom_labels, tick_positions]

        # many labels: shrink the font and rotate the x labels
        crowded = len(chrom_labels) > 20
        x_label_style = {'size': 4, 'rotation': 90} if crowded else {'size': 8}
        heat_axis.set_xticklabels(chrom_labels, **x_label_style)
        heat_axis.set_yticklabels(chrom_labels, size=4 if crowded else 8)

    if pBigwig is not None:
        colorbar_axis = pBigwig['axis_colorbar']
    else:
        colorbar_axis = make_axes_locatable(heat_axis).append_axes("right", size="2.5%", pad=0.09)

    cbar = fig.colorbar(mesh, cax=colorbar_axis)

    cbar.solids.set_edgecolor("face")  # to avoid white lines in the color bar in pdf plots
    if args.scoreName:
        cbar.ax.set_ylabel(args.scoreName, rotation=270, size=8)

    if ylabel is not None:
        heat_axis.set_ylabel(toString(ylabel))

    if xlabel is not None:
        heat_axis.set_xlabel(toString(xlabel))
    log.debug('foo')

    if not pBigwig:
        return

    heat_axis.xaxis.set_label_position("top")
    heat_axis.xaxis.tick_top()
    track_options = {
        'pChromosomeSizes': chrBinBoundaries,
        'pXticks': xticks,
        'pFlipBigwigSign': args.flipBigwigSign,
        'pScaleFactorBigwig': args.scaleFactorBigwig,
        'pValueMin': args.vMinBigwig,
        'pValueMax': args.vMaxBigwig,
    }
    if args.region:
        log.debug('region')
        # the region is only forwarded when one was requested
        track_options['pRegion'] = pBigwig['args'].region
    else:
        log.debug('else region')
    plotBigwig(pBigwig['axis'], pBigwig['args'].bigwig, **track_options)

예제 #17

0

파일 보기

def main(args=None):
    """Compute eigenvectors per chromosome and write them as bedgraph or bigwig.

    For every chromosome the matrix returned by ``exp_obs_matrix_lieberman``
    is turned into a Pearson correlation matrix (``np.corrcoef``), then into a
    covariance matrix (``np.cov``) whose eigenvectors are computed with
    ``linalg.eig``. The first ``args.numberOfEigenvectors`` eigenvectors are
    written, one per file in ``args.outputFileName``.

    The process exits with status 1 when the number of output files does not
    match the number of eigenvectors, when pyBigWig lacks numpy support, or
    when ``args.format`` is neither ``bedgraph`` nor ``bigwig``.

    :param args: optional list of command line arguments; ``None`` lets
                 argparse read ``sys.argv``.
    """
    args = parse_arguments().parse_args(args)
    if int(args.numberOfEigenvectors) != len(args.outputFileName):
        log.error(
            "Number of output file names and number of eigenvectors does not match. Please "
            "provide the name of each file.\nFiles: {}\nNumber of eigenvectors: {}"
            .format(args.outputFileName, args.numberOfEigenvectors))
        exit(1)

    ma = hm.hiCMatrix(args.matrix)
    ma.maskBins(ma.nan_bins)

    if args.chromosomes:
        ma.keepOnlyTheseChr(args.chromosomes)

    vecs_list = []
    chrom_list = []
    start_list = []
    end_list = []
    # PCA is computed per chromosome
    # First pass: total number of bins over all chromosomes.
    length_chromosome = 0
    chromosome_count = len(ma.getChrNames())
    for chrname in ma.getChrNames():
        chr_range = ma.getChrBinRange(chrname)
        length_chromosome += chr_range[1] - chr_range[0]
    for chrname in ma.getChrNames():
        chr_range = ma.getChrBinRange(chrname)
        log.debug("Computing pca for chromosome: {}".format(chrname))

        submatrix = ma.matrix[chr_range[0]:chr_range[1],
                              chr_range[0]:chr_range[1]]

        exp_obs_matrix_ = exp_obs_matrix_lieberman(submatrix,
                                                   length_chromosome,
                                                   chromosome_count)
        # nan/inf values are zeroed after every step so that the following
        # numpy routines receive finite input.
        exp_obs_matrix_ = convertNansToZeros(
            csr_matrix(exp_obs_matrix_)).todense()
        exp_obs_matrix_ = convertInfsToZeros(
            csr_matrix(exp_obs_matrix_)).todense()

        pearson_correlation_matrix = np.corrcoef(exp_obs_matrix_)
        pearson_correlation_matrix = convertNansToZeros(
            csr_matrix(pearson_correlation_matrix)).todense()
        pearson_correlation_matrix = convertInfsToZeros(
            csr_matrix(pearson_correlation_matrix)).todense()
        corrmatrix = np.cov(pearson_correlation_matrix)
        corrmatrix = convertNansToZeros(csr_matrix(corrmatrix)).todense()
        corrmatrix = convertInfsToZeros(csr_matrix(corrmatrix)).todense()
        evals, eigs = linalg.eig(corrmatrix)
        k = args.numberOfEigenvectors

        chrom, start, end, _ = zip(
            *ma.cut_intervals[chr_range[0]:chr_range[1]])
        vecs_list += eigs[:, :k].tolist()

        chrom_list += chrom
        start_list += start
        end_list += end

    if args.format == 'bedgraph':
        for idx, outfile in enumerate(args.outputFileName):
            assert (len(vecs_list) == len(chrom_list))

            with open(outfile, 'w') as fh:
                for i, value in enumerate(vecs_list):
                    if len(value) == args.numberOfEigenvectors:
                        # ``np.complex`` was removed from NumPy; the values
                        # come from ``tolist()`` and are plain Python numbers,
                        # so the builtin ``complex`` is the right check.
                        if isinstance(value[idx], complex):
                            value[idx] = value[idx].real
                        fh.write("{}\t{}\t{}\t{:.12f}\n".format(
                            toString(chrom_list[i]), start_list[i],
                            end_list[i], value[idx]))

    elif args.format == 'bigwig':
        if not pyBigWig.numpy == 1:
            log.error(
                "ERROR: Your version of pyBigWig is not supporting numpy: {}".
                format(pyBigWig.__file__))
            exit(1)
        # Build the bigwig header: one (chromosome, last end) pair for each
        # run of identical chromosome names in chrom_list.
        old_chrom = chrom_list[0]
        header = []
        for i, chrom_ in enumerate(chrom_list):
            if old_chrom != chrom_:
                header.append((toString(old_chrom), end_list[i - 1]))
            old_chrom = chrom_

        header.append((toString(chrom_list[-1]), end_list[-1]))
        for idx, outfile in enumerate(args.outputFileName):
            log.debug("bigwig: len(vecs_list) {}".format(len(vecs_list)))
            log.debug("bigwig: len(chrom_list) {}".format(len(chrom_list)))

            assert (len(vecs_list) == len(chrom_list))
            chrom_list_ = []
            start_list_ = []
            end_list_ = []
            values = []

            bw = pyBigWig.open(outfile, 'w')
            # set big wig header
            bw.addHeader(header)
            # create entry lists
            for i, value in enumerate(vecs_list):
                # it can happen that some 'value' is having less dimensions than it should
                if len(value) == args.numberOfEigenvectors:
                    if isinstance(value[idx], complex):
                        value[idx] = value[idx].real
                    values.append(value[idx])
                    chrom_list_.append(toString(chrom_list[i]))
                    start_list_.append(start_list[i])
                    end_list_.append(end_list[i])

            # write entries
            bw.addEntries(chrom_list_,
                          start_list_,
                          ends=end_list_,
                          values=values)
            bw.close()
    else:
        log.error("Output format not known: {}".format(args.format))
        exit(1)

예제 #18

0

파일 보기

파일: readBed.py 프로젝트: wangyibin/HiCExplorer

    def get_bed_interval(self, bed_line):
        r"""
        Processes each bed line from a bed file, casts the values and returns
        a namedtuple object

        The line is split on tabs and every field is cast according to its
        position: fields 1 and 4 stay strings, field 6 is normalised to one
        of '+', '-', '.', fields 2, 3, 7, 8 and 10 become integers, a
        three-part field 9 becomes a list of integers, fields 11 and 12
        become lists of integers and every other field is cast to float when
        possible. For bed3 files ".", 0, "." are appended as name, score and
        strand; for bed6 files only the first six fields are kept.

        :param bed_line: one line of a bed file
        :return: a ``self.BedInterval`` namedtuple, or an empty ``dict`` when
                 one of the integer fields cannot be parsed
        :raises AssertionError: if the number of fields does not match
                 ``self.file_type`` or if the end is not larger than the start

        >>> bed_line="chr1\t0\t1000\tgene_1\t0.5\t-\t0\t1000\t0\t3\t10,20,100\t20,200,700"
        >>> with open('/tmp/test.bed', 'w') as fh:
        ...     foo = fh.write(bed_line)
        >>> bed_f = ReadBed(open('/tmp/test.bed','r'))
        >>> bed = bed_f.get_bed_interval(bed_line)
        >>> bed.chromosome
        'chr1'
        >>> bed.block_starts
        [20, 200, 700]

        >>> bed_line="chr2\t0\t1000\tgene_1\t0.5\t-\n"
        >>> with open('/tmp/test.bed', 'w') as fh:
        ...     foo = fh.write(bed_line)
        >>> bed_f = ReadBed(open('/tmp/test.bed','r'))
        >>> bed_f.get_bed_interval(bed_line)
        BedInterval(chromosome='chr2', start=0, end=1000, name='gene_1', score=0.5, strand='-')
        """

        line_data = bed_line.strip()
        line_data = toString(line_data)
        line_data = line_data.split("\t")

        # The detected format lives in ``self.file_type``; comparing the file
        # handle with 'bed12' could never be true, so bed12 lines were never
        # validated.
        if self.file_type == 'bed12':
            assert len(line_data) == 12, "File type detected is bed12 but line {}: {} does " \
                                         "not have 12 fields.".format(self.line_number, bed_line)

        elif self.file_type == 'bed3':
            assert len(line_data) == 3, "File type detected is bed3 but line {}: {} does " \
                "not have 3 fields.".format(self.line_number, bed_line)

        elif self.file_type == 'bed6':
            assert len(line_data) == 6, "File type detected is bed6 but line {}: {} does " \
                "not have 6 fields.".format(self.line_number, bed_line)
        line_values = []
        for idx, r in enumerate(line_data):
            # first field is always chromosome/contig name
            # and should be cast as a string
            # same for field 3 (name)
            if idx in [0, 3]:
                line_values.append(r)
            # check field strand
            elif idx == 5:
                if r not in ['+', '-', '.']:
                    if r == '1':
                        r = '+'
                    elif r == '-1':
                        r = '-'
                    else:
                        # argument order follows the placeholders:
                        # value, line number, line content
                        log.warning(
                            "*Warning, invalid strand value found {} for line #{}:\n{}\n "
                            "Setting strand to '.'\n".format(
                                r, self.line_number, bed_line))
                        r = '.'
                line_values.append(r)

            elif idx in [1, 2, 6, 7, 9]:
                # start and end fields must be integers, same for thickStart(6),
                # and thickEnd(7) and blockCount(9) fields
                try:
                    line_values.append(int(r))
                except ValueError:
                    log.warning(
                        "Value: {} in field {} at line {} is not an integer\n".
                        format(r, idx + 1, self.line_number))
                    return dict()
            # check item rgb
            elif idx == 8:
                r = toString(r)
                rgb = r.split(",")

                if len(rgb) == 3:
                    try:
                        # ``map`` is lazy on Python 3; build the list here so
                        # that invalid values raise inside this ``try`` and a
                        # real list (not a one-shot iterator) is stored.
                        r = list(map(int, rgb))
                    except ValueError as detail:
                        log.debug(
                            "Error reading line: #{}. The rgb field {} is not "
                            "valid.\nError message: {}\n".format(
                                self.line_number, r, detail))
                line_values.append(r)

            elif idx in [10, 11]:
                # these are the block sizes and block start positions
                r = toString(r)
                r_parts = r.split(',')

                try:
                    r = [int(x) for x in r_parts if x != '']
                except ValueError as detail:
                    log.debug(
                        "Error reading line #{}. The block field {} is not "
                        "valid.\nError message: {}\n".format(
                            self.line_number, r, detail))
                line_values.append(r)

            else:
                # any remaining field (e.g. the score) is numeric if possible
                try:
                    tmp = float(r)
                except (ValueError, TypeError):
                    tmp = r
                line_values.append(tmp)

        assert line_values[2] > line_values[1], \
            "Start position larger or equal than end for line #{}:\n{}\n".format(self.line_number,
                                                                                 bed_line)

        if self.file_type == 'bed3':
            line_values = line_values[0:3]
            # in case of a bed3, the id, score and strand
            # values are added as ".", 0, "." respectively
            line_values.extend([".", 0, "."])
        elif self.file_type == 'bed6':
            line_values = line_values[0:6]

        return self.BedInterval._make(line_values)

예제 #19

0

파일 보기

def plotHeatmap(ma,
                chrBinBoundaries,
                fig,
                position,
                args,
                cmap,
                xlabel=None,
                ylabel=None,
                start_pos=None,
                start_pos2=None,
                pNorm=None,
                pAxis=None,
                pPca=None):
    """
    Draw the matrix ``ma`` as a heatmap with a colorbar.

    Matrices with fewer than five rows are not plotted; a log message is
    emitted and the function returns immediately.

    :param ma: matrix to plot; must expose ``shape`` and is handed to
               ``pcolormesh``.
    :param chrBinBoundaries: mapping of chromosome name to a
               ``(first, last)`` pair; used for the tick positions and
               labels when ``args.region`` is not set.
    :param fig: figure used to create the axes (when ``pAxis`` is None)
               and the colorbar.
    :param position: passed to ``fig.add_axes`` when ``pAxis`` is None.
    :param args: options object; the attributes ``title``, ``vMin``,
               ``vMax``, ``region``, ``log1p`` and ``scoreName`` are read.
    :param cmap: colormap passed to ``pcolormesh``.
    :param xlabel: optional x axis label.
    :param ylabel: optional y axis label.
    :param start_pos: coordinates for the first mesh axis; defaults to
               ``np.arange(ma.shape[0])``.
    :param start_pos2: coordinates for the second mesh axis; defaults to
               ``start_pos``.
    :param pNorm: normalization passed to ``pcolormesh`` as ``norm``.
    :param pAxis: existing axes to draw on instead of creating new ones.
    :param pPca: optional dict with the keys ``'axis'``,
               ``'axis_colorbar'`` and ``'args'``; when given, the
               colorbar is drawn on ``pPca['axis_colorbar']`` and
               ``plotEigenvector`` is called for ``pPca['axis']``.
    :return: None
    """
    log.debug("plotting heatmap")
    if ma.shape[0] < 5:
        # dict views cannot be indexed on Python 3, so the first key is
        # taken by iteration (None if the mapping is empty).
        first_chrom = next(iter(chrBinBoundaries), None)
        log.info("Matrix for {} too small to plot. Matrix size: {}".format(
            first_chrom, ma.shape))
        return
    if pAxis is not None:
        axHeat2 = pAxis
    else:
        axHeat2 = fig.add_axes(position)

    if args.title:
        axHeat2.set_title(toString(args.title))

    if start_pos is None:
        start_pos = np.arange(ma.shape[0])
    if start_pos2 is None:
        start_pos2 = start_pos

    xmesh, ymesh = np.meshgrid(start_pos, start_pos2)

    img3 = axHeat2.pcolormesh(xmesh.T,
                              ymesh.T,
                              ma,
                              vmin=args.vMin,
                              vmax=args.vMax,
                              cmap=cmap,
                              norm=pNorm)
    axHeat2.invert_yaxis()
    img3.set_rasterized(True)
    xticks = None
    if args.region:
        # a region is plotted: relabel the automatically chosen ticks
        xtick_lables = relabel_ticks(axHeat2.get_xticks())
        axHeat2.get_xaxis().set_tick_params(which='both',
                                            bottom='on',
                                            direction='out')
        axHeat2.set_xticklabels(xtick_lables, size='small', rotation=45)

        ytick_lables = relabel_ticks(axHeat2.get_yticks())
        axHeat2.get_yaxis().set_tick_params(which='both',
                                            bottom='on',
                                            direction='out')
        axHeat2.set_yticklabels(ytick_lables, size='small')
        xticks = [xtick_lables]
        # Alternative labelling that is currently disabled:
        #   axHeat2.set_xticks([0, ma.shape[0]])
        #   axHeat2.set_xticklabels([args.region[1], args.region[2]], size=4, rotation=90)
        #   axHeat2.set_axis_off()
    else:
        # whole chromosomes are plotted: one tick in the middle of each
        # chromosome, labelled with the chromosome name
        ticks = [
            int(pos[0] + (pos[1] - pos[0]) / 2)
            for pos in itervalues(chrBinBoundaries)
        ]
        labels = list(chrBinBoundaries)
        axHeat2.set_xticks(ticks)
        axHeat2.set_yticks(ticks)
        labels = toString(labels)
        xticks = [labels, ticks]

        if len(labels) > 20:
            # many chromosomes: shrink and rotate the labels
            axHeat2.set_xticklabels(labels, size=4, rotation=90)
            axHeat2.set_yticklabels(labels, size=4)

        else:
            axHeat2.set_xticklabels(labels, size=8)
            axHeat2.set_yticklabels(labels, size=8)

    if pPca is None:
        divider = make_axes_locatable(axHeat2)
        cax = divider.append_axes("right", size="2.5%", pad=0.09)
    else:
        cax = pPca['axis_colorbar']
    if args.log1p:
        from matplotlib.ticker import LogFormatter
        formatter = LogFormatter(10, labelOnlyBase=False)
        # get a useful log scale
        # that looks like [1, 2, 5, 10, 20, 50, 100, ... etc]
        aa = np.array([1, 2, 5])
        tick_values = np.concatenate([aa * 10**x for x in range(10)])
        cbar = fig.colorbar(img3, ticks=tick_values, format=formatter, cax=cax)
    else:
        cbar = fig.colorbar(img3, cax=cax)

    cbar.solids.set_edgecolor(
        "face")  # to avoid white lines in the color bar in pdf plots
    if args.scoreName:
        cbar.ax.set_ylabel(args.scoreName, rotation=270, size=8)

    if ylabel is not None:
        ylabel = toString(ylabel)
        axHeat2.set_ylabel(ylabel)

    if xlabel is not None:
        xlabel = toString(xlabel)
        axHeat2.set_xlabel(xlabel)

    if pPca:
        axHeat2.xaxis.set_label_position("top")
        axHeat2.xaxis.tick_top()
        if args.region:
            plotEigenvector(pPca['axis'],
                            pPca['args'].pca,
                            pRegion=pPca['args'].region,
                            pXticks=xticks)
        else:
            plotEigenvector(pPca['axis'],
                            pPca['args'].pca,
                            pXticks=xticks,
                            pChromosomeList=labels)

예제 #20

0

파일 보기

파일: hicPlotMatrix.py 프로젝트: ryys1122/HiCExplorer

def plotPerChr(hic_matrix, cmap, args, pBigwig, pResolution):
    """
    Plot each chromosome of ``hic_matrix`` as its own heatmap on one figure.

    The panels are placed on a grid with at most five chromosomes per row;
    the grid reserves one additional narrow column. When ``pBigwig`` is
    truthy, every panel is subdivided so that one extra axis per entry of
    ``args.bigwig`` is created below the heatmap (and, if
    ``args.bigwigAdditionalVerticalAxis`` is set, also to its left).

    :param hic_matrix: matrix object; ``getChrNames``, ``getChrBinRange``,
                       ``get_chromosome_sizes``, ``matrix`` and
                       ``nan_bins`` are used.
    :param cmap: colormap forwarded to ``plotHeatmap``.
    :param args: options object; ``bigwig``, ``increaseFigureHeight``,
                 ``increaseFigureWidth``, ``dpi``,
                 ``bigwigAdditionalVerticalAxis``, ``log`` and ``log1p``
                 are read. ``args.region`` is overwritten with the name of
                 the chromosome currently being plotted.
    :param pBigwig: truthy if bigwig tracks should be added to each panel.
    :param pResolution: forwarded unchanged to ``plotHeatmap``.
    :return: the created figure
    """
    from math import ceil
    chromosomes = hic_matrix.getChrNames()
    chrom_per_row = 5
    num_rows = int(ceil(float(len(chromosomes)) / chrom_per_row))
    num_cols = min(chrom_per_row, len(chromosomes))
    width_ratios = [1.0] * num_cols + [0.05]
    grids = gridspec.GridSpec(num_rows, num_cols + 1,
                              width_ratios=width_ratios,
                              height_ratios=[1] * num_rows)

    fig_height = 6 * num_rows
    fig_width = sum((np.array(width_ratios) + 0.05) * 6)
    if pBigwig:
        # every bigwig track enlarges the figure in both directions
        for _ in args.bigwig:
            fig_height += args.increaseFigureHeight
            fig_width += args.increaseFigureWidth

    fig = plt.figure(figsize=(fig_width, fig_height), dpi=args.dpi)

    for idx, chrname in enumerate(chromosomes):
        log.debug('chrom: {}'.format(chrname))
        bigwig_info = None
        row = idx // chrom_per_row
        col = idx % chrom_per_row
        if pBigwig:
            bigwig_info = {'args': args, 'axis': None,
                           'axis_colorbar': None, 'nan_bins': hic_matrix.nan_bins}

            num_tracks = len(args.bigwig)
            # Each track gets a share of 0.07; the heatmap takes the rest.
            bigwig_heights = [0.07] * num_tracks
            heatmap_height_ratio = 0.95 - (0.07 * num_tracks)
            heatmap_width_ratio = 0.97 - (0.07 * num_tracks)
            if heatmap_height_ratio < 0.4:
                # Too many tracks: keep the heatmap at a share of 0.4 and
                # split the remaining 0.6 evenly between the tracks.
                heatmap_height_ratio = 0.4
                heatmap_width_ratio = 0.42
                bigwig_heights = [0.6 / num_tracks] * num_tracks

            if args.bigwigAdditionalVerticalAxis:
                # layout: [vertical tracks ... | heatmap | colorbar] x
                #         [heatmap row, one row per horizontal track]
                gs = gridspec.GridSpecFromSubplotSpec(
                    1 + num_tracks, 2 + num_tracks,
                    height_ratios=[heatmap_height_ratio, *bigwig_heights],
                    width_ratios=[*bigwig_heights, heatmap_width_ratio, 0.03],
                    subplot_spec=grids[row, col], wspace=0.1, hspace=0.1)
                axis = plt.subplot(gs[0, num_tracks])
                ax2_list = [plt.subplot(gs[1 + i, num_tracks])
                            for i in range(num_tracks)]
                bigwig_vertical_axis_list = [plt.subplot(gs[0, i])
                                             for i in range(num_tracks)]
                ax3 = plt.subplot(gs[0, num_tracks + 1])
                bigwig_info['axis'] = ax2_list
                bigwig_info['axis_colorbar'] = ax3
                bigwig_info['axis_vertical'] = bigwig_vertical_axis_list

            else:
                # layout: [heatmap | colorbar] x
                #         [heatmap row, one row per track]
                gs = gridspec.GridSpecFromSubplotSpec(
                    1 + num_tracks, 2,
                    height_ratios=[heatmap_height_ratio, *bigwig_heights],
                    width_ratios=[0.97, 0.03],
                    subplot_spec=grids[row, col], wspace=0.1, hspace=0.1)
                axis = plt.subplot(gs[0, 0])
                ax2_list = [plt.subplot(gs[1 + i, 0])
                            for i in range(num_tracks)]
                ax3 = plt.subplot(gs[0, 1])
                bigwig_info['axis'] = ax2_list
                bigwig_info['axis_colorbar'] = ax3
        else:
            axis = plt.subplot(grids[row, col])
            axis.set_title(toString(chrname))
        chrom_range = hic_matrix.getChrBinRange(chrname)
        matrix = np.asarray(hic_matrix.matrix[chrom_range[0]:chrom_range[1],
                                              chrom_range[0]:chrom_range[1]].todense().astype(float))

        norm = None
        if args.log or args.log1p:
            # zeros, NaNs and infs cannot be shown on a log scale; replace
            # them by the smallest remaining value
            mask = matrix == 0
            mask_nan = np.isnan(matrix)
            mask_inf = np.isinf(matrix)
            log.debug("any nan {}".format(np.isnan(matrix).any()))
            log.debug("any inf {}".format(np.isinf(matrix).any()))

            try:
                matrix[mask] = np.nanmin(matrix[~mask])
                matrix[mask_nan] = np.nanmin(matrix[~mask_nan])
                matrix[mask_inf] = np.nanmin(matrix[~mask_inf])

            except Exception:
                # best effort: plotting continues with the uncleaned matrix
                log.debug("Clearing of matrix failed.")
            log.debug("any nanafter remove of nan: {}".format(
                np.isnan(matrix).any()))
            log.debug("any inf after remove of inf: {}".format(
                np.isinf(matrix).any()))
        if args.log1p:
            matrix += 1
            norm = LogNorm()

        elif args.log:
            norm = LogNorm()

        chr_bin_boundary = OrderedDict()
        chr_bin_boundary[chrname] = hic_matrix.get_chromosome_sizes()[chrname]

        # getRegion and plotHeatmap read the region from args
        args.region = toString(chrname)
        _, _, _, _, start_pos1, _, _, _, _, start_pos2 = getRegion(
            args, hic_matrix)
        plotHeatmap(matrix, chr_bin_boundary, fig, None,
                    args, cmap, xlabel=chrname, ylabel=chrname,
                    start_pos=start_pos1, start_pos2=start_pos2, pNorm=norm, pAxis=axis, pBigwig=bigwig_info,
                    pChromsomeStartEndDict=chromosome_start_end(hic_matrix), pResolution=pResolution)
    return fig

예제 #21

0

파일 보기

파일: hicCorrectMatrix.py 프로젝트: Rungetf/HiCExplorer

def main(args=None):
    """
    Entry point: filter and iteratively correct a Hi-C matrix.

    The matrix named by ``args.matrix`` is loaded, bins without any counts
    are masked and NaN/inf values are replaced by zeros. If the parsed
    arguments contain ``plotName`` only the diagnostic plot is produced.
    Otherwise bins are filtered by z-score (and optionally by coverage),
    the matrix is corrected with ``iterative_correction`` (per chromosome
    if ``args.perchr`` is set), overly inflated bins are masked and the
    result is saved to ``args.outFileName``.

    :param args: argument list handed to the argument parser; ``None``
                 lets the parser use its default source.
    :return: None
    """
    args = parse_arguments().parse_args(args)
    if args.verbose:
        log.setLevel(logging.INFO)

    # args.chromosomes
    if check_cooler(args.matrix) and args.chromosomes is not None and len(args.chromosomes) == 1:
        ma = hm.hiCMatrix(args.matrix, pChrnameList=toString(args.chromosomes))
    else:
        ma = hm.hiCMatrix(args.matrix)

        if args.chromosomes:
            ma.reorderChromosomes(toString(args.chromosomes))

    # mask all zero value bins
    row_sum = np.asarray(ma.matrix.sum(axis=1)).flatten()
    zero_bins = np.flatnonzero(row_sum == 0)
    log.info("Removing {} zero value bins".format(len(zero_bins)))
    ma.maskBins(zero_bins)
    matrix_shape = ma.matrix.shape
    ma.matrix = convertNansToZeros(ma.matrix)
    ma.matrix = convertInfsToZeros(ma.matrix)

    if 'plotName' in args:
        plot_total_contact_dist(ma, args)
        log.info("Saving diagnostic plot {}\n".format(args.plotName))
        return

    log.info("matrix contains {} data points. Sparsity {:.3f}.".format(
        len(ma.matrix.data),
        float(len(ma.matrix.data)) / (ma.matrix.shape[0] ** 2)))

    if args.skipDiagonal:
        ma.diagflat(value=0)

    outlier_regions = filter_by_zscore(ma, args.filterThreshold[0], args.filterThreshold[1], perchr=args.perchr)
    # compute and print some statistics
    pct_outlier = 100 * float(len(outlier_regions)) / ma.matrix.shape[0]
    # NOTE(review): the template has a single placeholder, so the second
    # format argument is not part of the resulting label.
    ma.printchrtoremove(outlier_regions, label="Bins that are MAD outliers ({:.2f}%) "
                                               "out of".format(pct_outlier, ma.matrix.shape[0]),
                        restore_masked_bins=False)

    assert matrix_shape == ma.matrix.shape
    # mask filtered regions
    ma.maskBins(outlier_regions)
    total_filtered_out = set(outlier_regions)

    if args.sequencedCountCutoff and 0 < args.sequencedCountCutoff < 1:
        _, _, _, coverage = zip(*ma.cut_intervals)

        assert isinstance(coverage[0], np.float64)

        failed_bins = np.flatnonzero(
            np.array(coverage) < args.sequencedCountCutoff)

        ma.printchrtoremove(failed_bins, label="Bins with low coverage", restore_masked_bins=False)
        ma.maskBins(failed_bins)
        # accumulate instead of overwriting, otherwise the outlier bins
        # collected above are missing from the final total
        total_filtered_out = total_filtered_out.union(failed_bins)

    if args.transCutoff and 0 < args.transCutoff < 100:
        cutoff = float(args.transCutoff) / 100
        # a usual cutoff is 0.05
        ma.truncTrans(high=cutoff)

    pre_row_sum = np.asarray(ma.matrix.sum(axis=1)).flatten()
    correction_factors = []
    if args.perchr:
        corrected_matrix = lil_matrix(ma.matrix.shape)
        # normalize each chromosome independently
        for chrname in list(ma.interval_trees):
            chr_range = ma.getChrBinRange(chrname)
            chr_submatrix = ma.matrix[chr_range[0]:chr_range[1], chr_range[0]:chr_range[1]]
            _matrix, _corr_factors = iterative_correction(chr_submatrix, args)
            corrected_matrix[chr_range[0]:chr_range[1], chr_range[0]:chr_range[1]] = _matrix
            correction_factors.append(_corr_factors)
        correction_factors = np.concatenate(correction_factors)

    else:
        corrected_matrix, correction_factors = iterative_correction(ma.matrix, args)

    ma.setMatrixValues(corrected_matrix)
    ma.setCorrectionFactors(correction_factors)
    log.info("Correction factors {}".format(correction_factors[:10]))
    if args.inflationCutoff and args.inflationCutoff > 0:
        after_row_sum = np.asarray(corrected_matrix.sum(axis=1)).flatten()
        # identify rows that were expanded more than args.inflationCutoff times
        to_remove = np.flatnonzero(after_row_sum / pre_row_sum >= args.inflationCutoff)
        ma.printchrtoremove(to_remove,
                            label="inflated >={} "
                            "regions".format(args.inflationCutoff), restore_masked_bins=False)
        total_filtered_out = total_filtered_out.union(to_remove)

        ma.maskBins(to_remove)

    ma.printchrtoremove(sorted(total_filtered_out),
                        label="Total regions to be removed", restore_masked_bins=False)

    ma.save(args.outFileName, pApplyCorrection=False)

예제 #22

0

파일 보기

파일: hicInfo.py 프로젝트: bgruening/HiCExplorer

def _cooler_info_value(info, key, convert=None):
    """Return ``info[key]`` (passed through ``convert`` if given) or None when the key is absent."""
    if key not in info:
        return None
    value = info[key]
    return value if convert is None else convert(value)


def main(args=None):
    """
    Entry point: print or save a summary for every matrix in ``args.matrices``.

    For cooler files the values are taken from the file's ``info``
    attributes when ``args.no_metadata`` is truthy; otherwise the matrix is
    loaded and the values are computed from it. The report is written to
    ``args.outFileName`` if given (reports of several matrices are
    appended to the same file), else it is printed.

    :param args: argument list handed to the argument parser; ``None``
                 lets the parser use its default source.
    :return: None
    """
    args = parse_arguments().parse_args(args)
    for matrix_index, matrix in enumerate(args.matrices):
        generated_by = None
        genome_assembly = None
        statistics = None
        generated_by_cooler_lib = None
        tool_url = None
        matrix_generated_by = None
        matrix_generated_by_url = None
        creation_date = None
        bin_length = None
        size = None
        nchroms = None
        num_non_zero = None
        min_non_zero = None
        max_non_zero = None
        sum_elements = None
        num_nan_bins = None

        # NOTE(review): the metadata path is taken when args.no_metadata is
        # truthy - confirm this matches the option's definition in the parser.
        if check_cooler(matrix) and args.no_metadata:
            cooler_file = cooler.Cooler(matrix)

            # read the attribute once instead of once per key
            info = cooler_file.info
            if info is not None:
                bin_length = _cooler_info_value(info, 'bin-size')
                size = _cooler_info_value(info, 'nbins')
                nchroms = _cooler_info_value(info, 'nchroms')
                num_non_zero = _cooler_info_value(info, 'nnz')
                min_non_zero = _cooler_info_value(info, 'min-value')
                max_non_zero = _cooler_info_value(info, 'max-value')
                generated_by = _cooler_info_value(info, 'generated-by', toString)
                genome_assembly = _cooler_info_value(info, 'genome-assembly', toString)
                metadata = _cooler_info_value(info, 'metadata')
                if metadata is not None and 'statistics' in metadata:
                    statistics = metadata['statistics']
                generated_by_cooler_lib = _cooler_info_value(
                    info, 'generated-by-cooler-lib', toString)
                tool_url = _cooler_info_value(info, 'tool-url', toString)
                matrix_generated_by = _cooler_info_value(
                    info, 'matrix-generated-by', toString)
                matrix_generated_by_url = _cooler_info_value(
                    info, 'matrix-generated-by-url', toString)
                creation_date = _cooler_info_value(info, 'creation-date')
                sum_elements = _cooler_info_value(info, 'sum-elements')

            # always needed for the report below, independent of the
            # presence of the info attributes
            chromosome_sizes = cooler_file.chromsizes

        else:
            hic_ma = hm.hiCMatrix(matrix)
            size = hic_ma.matrix.shape[0]
            num_non_zero = hic_ma.matrix.nnz
            # off-diagonal values are halved, the diagonal is counted in full
            diagonal_sum = hic_ma.matrix.diagonal().sum()
            sum_elements = ((hic_ma.matrix.sum() - diagonal_sum) / 2) + diagonal_sum
            bin_length = hic_ma.getBinSize()
            num_nan_bins = len(hic_ma.nan_bins)
            min_non_zero = hic_ma.matrix.data.min()
            max_non_zero = hic_ma.matrix.data.max()

            chromosome_sizes = hic_ma.get_chromosome_sizes()

        report = [
            "# Matrix information file. Created with HiCExplorer's hicInfo version {}\n".format(__version__)]

        if matrix is not None:
            report.append("File:\t{}\n".format(matrix))
        if creation_date is not None:
            report.append("Date:\t{}\n".format(creation_date))

        if genome_assembly is not None:
            report.append("Genome assembly:\t{}\n".format(genome_assembly))
        if size is not None:
            report.append("Size:\t{:,}\n".format(size))
        if bin_length is not None:
            report.append("Bin_length:\t{}\n".format(bin_length))
        if sum_elements is not None:
            report.append("Sum of matrix:\t{}\n".format(sum_elements))
        report.append("Chromosomes:length: ")
        for key, value in chromosome_sizes.items():
            report.append("{}: {} bp; ".format(key, value))
        report.append('\n')
        if nchroms is not None:
            report.append("Number of chromosomes:\t{}\n".format(nchroms))
        if num_non_zero is not None:
            report.append(
                "Non-zero elements:\t{:,}\n".format(num_non_zero))
        if min_non_zero is not None:
            report.append("Minimum (non zero):\t{}\n".format(min_non_zero))
        if max_non_zero is not None:
            report.append("Maximum:\t{}\n".format(max_non_zero))
        if num_nan_bins is not None:
            report.append("NaN bins:\t{}\n".format(num_nan_bins))

        if check_cooler(matrix):
            report.append('The following columns are available: {}\n'.format(
                cooler.Cooler(matrix).bins().columns.values))
        if generated_by is not None:
            report.append("\n\nGenerated by:\t{}\n".format(generated_by))

        if generated_by_cooler_lib is not None:
            report.append("Cooler library version:\t{}\n".format(
                generated_by_cooler_lib))
        if tool_url is not None:
            report.append("HiCMatrix url:\t{}\n".format(tool_url))
        if matrix_generated_by is not None:
            report.append(
                "Interaction matrix created with:\t{}\n".format(matrix_generated_by))
        if matrix_generated_by_url is not None:
            report.append("URL:\t{}\n".format(matrix_generated_by_url))

        if statistics is not None:
            report.append("\n\nBuild statistics:\n{}\n".format(statistics))

        report_text = ''.join(report)
        if args.outFileName:
            # The first matrix truncates the file; later ones append, so the
            # reports of all matrices are kept instead of only the last one.
            mode = 'w' if matrix_index == 0 else 'a'
            with open(args.outFileName, mode) as out_file:
                out_file.write(report_text)
        else:
            print(report_text)