Example #1
def loadCooler(cooleruri,
               applyNorm=False,
               norm='weight',
               includeChroms=None,
               nans_to_zero=False):
    '''
    loads a cooler into a contact matrix
    adapted from HiCMatrix cool.py; see also
    https://github.com/deeptools/HiCMatrix/blob/master/hicmatrix/lib/cool.py

    :param cooleruri:       uri to a given cooler
    :param applyNorm:       if True, the 'norm' weights are applied to the datapoints in the matrix
    :param norm:            normalization weights to apply if applyNorm is set True
    :param includeChroms:   list of chromosomes to load; if given, only the specified chromosomes are loaded from the cooler
    :param nans_to_zero:    if True, NaN entries in the matrix are replaced with 0

    :return:            matrix data as a dense numpy array, plus chromosome start offsets and chromosome names
    '''
    cooler_file = cooler.Cooler(cooleruri)
    matrix = cooler_file.matrix(balance=norm if applyNorm else False)[:]

    chroms = cooler_file.chromnames
    inds = set()
    for chrom in chroms:
        for binidx in cooler_file.extent(chrom):
            inds.add(binidx)

    inds = sorted(list(inds))

    if includeChroms:
        includechroms = set(includeChroms)
        filterinds, filterchroms = [], []
        for i, chrom in enumerate(chroms):
            if chrom in includechroms:
                filterinds.append([
                    inds[i],
                    inds[i + 1] if i + 1 != len(inds) else matrix.shape[0]
                ])
                filterchroms.append(chrom)

        matrixinds = np.zeros(shape=matrix.shape[0], dtype=bool)
        ncuts, tmpe = [], 0
        for s, e in filterinds:
            matrixinds[s:e] = True

            if s == tmpe:
                ncuts.append(s)
                tmpe = e

            else:
                ncuts.append(tmpe)
                tmpe = e - s + tmpe

        matrix = matrix[matrixinds, :][:, matrixinds]

        inds = ncuts

        chroms = filterchroms

    # replace NaN entries (e.g. from balancing weights) with zeros,
    # regardless of whether chromosomes were filtered
    if nans_to_zero:
        matrix[np.isnan(matrix)] = 0

    return matrix, np.array(inds), np.array(chroms)
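
# Usage sketch for loadCooler. The cooler URI below is hypothetical, and
# cooler and numpy (as np) are assumed to be imported at module level, as
# the function requires.
if __name__ == '__main__':
    matrix, offsets, chroms = loadCooler('sample_10kb.cool',
                                         applyNorm=True,
                                         includeChroms=['chr1', 'chr2'],
                                         nans_to_zero=True)
    # offsets holds the first-bin index of each kept chromosome
    print(chroms, offsets, matrix.shape)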
Example #2
def hic2cool_extractnorms(infile,
                          outfile,
                          exclude_mt=False,
                          show_warnings=False,
                          silent=False):
    """
    Find all normalization vectors in the given hic file at all resolutions and
    attempts to add them to the given cooler file. Does not add any metadata
    to the cooler file. TODO: should we add `extract-norms-date` attr?

    Params:
    <infile> str .hic filename
    <outfile> str .cool output filename
    <exclude_mt> bool. If True, ignore MT contacts. Defaults to False.
    <show_warnings> bool. If True, print out WARNING messages
    <silent> bool. If true, hide standard output
    """
    unit = 'BP'  # only using base pair unit for now
    # Global hic normalization types used
    global NORMS
    NORMS = []
    global WARN
    WARN = False
    req = open(infile, 'rb')
    buf = mmap.mmap(req.fileno(), 0, access=mmap.ACCESS_READ)
    used_chrs, resolutions, masteridx, genome, metadata = read_header(req)
    pair_footer_info, expected, factors, norm_info = read_footer(
        req, buf, masteridx)
    # expected/factors unused for now
    del expected
    del factors

    chr_names = [used_chrs[key][1] for key in used_chrs.keys()]
    if not silent:  # print hic header info for command line usage
        print('################################')
        print('### hic2cool / extract-norms ###')
        print('################################')
        print('Header info from hic:')
        print('... Chromosomes: ', chr_names)
        print('... Resolutions: ', resolutions)
        print('... Normalizations: ', NORMS)
        print('... Genome: ', genome)

    if exclude_mt:  # remove mitochondrial chr by name if this flag is set
        # try to find index of chrM (a.k.a chrMT) if it is present
        mt_names = ['m', 'mt', 'chrm', 'chrmt']
        found_idxs = [
            idx for idx, fv in used_chrs.items() if fv[1].lower() in mt_names
        ]
        if len(found_idxs) == 1:
            excl = used_chrs.pop(found_idxs[0], None)
            if not silent:
                print('... Excluding chromosome %s with index %s' %
                      (excl[1], excl[0]))
        elif len(found_idxs) > 1:
            error_str = (
                'ERROR. More than one chromosome was found when attempting to'
                ' exclude MT. Found chromosomes: %s' % chr_names)
            force_exit(error_str, req)
        else:
            if not silent:
                print('... No chromosome found when attempting to exclude MT.')

    # exclude 'all' from chromosomes
    chromosomes = [
        uc[1] for uc in used_chrs.values() if uc[1].lower() != 'all'
    ]
    lengths = [uc[2] for uc in used_chrs.values() if uc[1].lower() != 'all']
    chromsizes = pd.Series(index=chromosomes, data=lengths)

    cooler_groups = {}
    for path in cooler.fileops.list_coolers(outfile):
        binsize = cooler.Cooler(outfile + '::' + path).info['bin-size']
        cooler_groups[binsize] = path
    if not silent:
        print('### Found cooler contents:')
        print('... %s' % cooler_groups)

    for norm in NORMS:
        for binsize in resolutions:
            if binsize not in cooler_groups:
                if not silent:
                    print('... Skip resolution %s; it is not in cooler file' %
                          binsize)
                continue
            if not silent:
                print('... Extracting %s normalization vector at %s BP' %
                      (norm, binsize))
            chrom_map = {}
            bins = cooler.binnify(chromsizes, binsize)
            lengths_in_bins = bins.groupby('chrom').size()
            for chr_val in [
                    uc for uc in used_chrs.values() if uc[1].lower() != 'all'
            ]:
                chr_num_bins = lengths_in_bins.loc[chr_val[1]]
                try:
                    norm_key = norm_info[norm, unit, binsize, chr_val[0]]
                except KeyError:
                    WARN = True
                    if show_warnings and not silent:
                        print_stderr(
                            '!!! WARNING. Normalization vector %s does not exist for %s.'
                            % (norm, chr_val[1]))
                    # add a vector of 0's with length equal to by_chr_bins[chr_idx]
                    norm_vector = [np.nan] * chr_num_bins
                else:
                    norm_vector = read_normalization_vector(
                        req, buf, norm_key)[:chr_num_bins]
                chrom_map[chr_val[1]] = norm_vector

            # hic normalization vectors have inconsistent lengths...
            # truncate appropriately
            bins[norm] = np.concatenate(
                [chrom_map[chrom] for chrom in chromosomes])
            if not silent:
                print('... Writing to cool file ...')
                print('%s\n... Truncated ...' % bins.head())
            group_path = cooler_groups[binsize]
            cooler.create.append(outfile + '::' + group_path,
                                 'bins', {norm: bins[norm].values},
                                 force=True)
    req.close()
    if not silent:
        if WARN and not show_warnings:
            print(
                '... Warnings were found in this run. Run with -v to display them.'
            )
        print('### Finished! Output written to: %s' % outfile)
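
# Usage sketch (hypothetical file names): copy all normalization vectors
# found in a .hic file into the matching resolutions of an existing
# multi-resolution .cool file.
if __name__ == '__main__':
    hic2cool_extractnorms('sample.hic', 'sample.multi.cool',
                          exclude_mt=True, show_warnings=True)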
Example #3
def adjustMatrix(pArgs):
    if pArgs.chromosomes is not None and pArgs.regions is not None:
        log.error('Please specify either --chromosomes or --regions, not both.')
        exit(1)
    hic_matrix = None
    if pArgs.chromosomes:

        if check_cooler(pArgs.matrix) and len(
                pArgs.chromosomes) == 1 and pArgs.action == 'keep':
            chromosomes_list = cooler.Cooler(pArgs.matrix).chromnames
            if pArgs.chromosomes[0] in chromosomes_list:
                hic_matrix = hm.hiCMatrix(pArgs.matrix,
                                          pChrnameList=pArgs.chromosomes)
            else:
                log.error('Chromosome not available in matrix: {} {}'.format(
                    pArgs.matrix, pArgs.chromosomes[0]))
                exit(1)
        else:
            hic_matrix = hm.hiCMatrix(pArgs.matrix)

        chromosomes_list = list(hic_matrix.chrBinBoundaries)
        chromosomes_list_to_operate_on = []
        log.debug('pArgs.chromosomes {}'.format(pArgs.chromosomes))
        for chromosome in pArgs.chromosomes:
            if chromosome in chromosomes_list:
                chromosomes_list_to_operate_on.append(chromosome)
            else:
                log.warning('Chromosome not available in matrix: {} {}'.format(
                    pArgs.matrix, chromosome))
        log.debug('chromosomes_list_to_operate_on {}'.format(
            chromosomes_list_to_operate_on))
        if len(chromosomes_list_to_operate_on) == 0:
            log.error('No valid chromosome given: {}. Available: {}'.format(
                pArgs.chromosomes, chromosomes_list))
            exit(1)
        if pArgs.action == 'keep':
            hic_matrix.reorderChromosomes(chromosomes_list_to_operate_on)
        elif pArgs.action == 'remove':
            log.debug('chromosomes_list {}'.format(chromosomes_list))

            for chromosome in chromosomes_list_to_operate_on:
                if chromosome in chromosomes_list:
                    chromosomes_list.remove(chromosome)
                    log.debug('chromosome {}'.format(chromosome))
            log.debug('chromosomes_list {}'.format(chromosomes_list))
            hic_matrix.reorderChromosomes(chromosomes_list)
        elif pArgs.action == 'mask':
            hic_matrix.maskChromosomes(chromosomes_list_to_operate_on)

    elif pArgs.regions:
        hic_matrix = hm.hiCMatrix(pArgs.matrix)
        chromosomes_list = list(hic_matrix.chrBinBoundaries)
        genomic_regions = []
        with open(pArgs.regions, 'r') as file:
            for line in file.readlines():
                _line = line.strip().split('\t')
                log.debug('_line {}'.format(_line))
                if len(_line) < 3:
                    log.warning(
                        "An entry shorter than 3 columns has been found!")
                    continue
                if len(_line) >= 3:
                    chrom, start, end = _line[0], int(_line[1]), int(_line[2])
                    log.debug('chrom {}'.format(chrom))
                    if chrom in chromosomes_list:
                        genomic_regions.append((chrom, start, end))
                    else:
                        log.warning('Chromosome not available in matrix, '
                                    'ignoring regions: {} {}'.format(
                                        pArgs.matrix, chrom))

        if len(genomic_regions) == 0:
            log.error('No valid chromosome given. Available: {}'.format(
                chromosomes_list))
            exit(1)
        matrix_indices_regions = []
        for region in genomic_regions:
            log.debug('region {}'.format(region))
            _regionBinRange = hic_matrix.getRegionBinRange(
                region[0], int(region[1]), int(region[2]))
            if _regionBinRange is not None:
                start, end = _regionBinRange
                matrix_indices_regions.extend(list(range(
                    start, end + 1)))  # end is inclusive, so +1
                if pArgs.action == 'remove':
                    chrom = region[0]
                    chr_start, chr_end = hic_matrix.getChrBinRange(chrom)
                    if (start > chr_start) and (end < chr_end - 1):
                        log.warning(
                            "{}:{}-{} entry may generate disconnected regions on a chromosome. "
                            "Please consider using the `mask` action to deal with that."
                            .format(chrom, start, end))

        if pArgs.action == 'keep':
            hic_matrix.reorderBins(matrix_indices_regions)

        elif pArgs.action == 'mask':
            hic_matrix.maskBins(matrix_indices_regions)

        elif pArgs.action == 'remove':
            hic_matrix.maskBins(matrix_indices_regions)
            hic_matrix.orig_bin_ids = []
            hic_matrix.orig_cut_intervals = []
            hic_matrix.nan_bins = []

    elif pArgs.maskBadRegions:
        if check_cooler(pArgs.matrix) and pArgs.chromosomes is not None and len(
                pArgs.chromosomes) == 1 and pArgs.action == 'keep':
            hic_matrix = hm.hiCMatrix(pArgs.matrix,
                                      pChrnameList=pArgs.chromosomes)
        else:
            hic_matrix = hm.hiCMatrix(pArgs.matrix)

    else:
        log.info(
            'No data to adjust given. Please specify either the --chromosomes or the --regions parameter.'
        )

    if pArgs.interIntraHandling is not None:
        if pArgs.interIntraHandling == 'inter':
            # iterate over all given chromosomes
            chromosomes_list = list(hic_matrix.chrBinBoundaries)
            for chromosome in chromosomes_list:
                # get start and end bins of that chromosome
                start, end = hic_matrix.getChrBinRange(chromosome)
                # remove inter region
                hic_matrix.matrix[start:end, end:] = 0

        elif pArgs.interIntraHandling == 'intra':
            # iterate over all given chromosomes
            chromosomes_list = list(hic_matrix.chrBinBoundaries)
            for chromosome in chromosomes_list:
                # get start and end bins of that chromosome
                start, end = hic_matrix.getChrBinRange(chromosome)
                # remove intra region
                hic_matrix.matrix[start:end, start:end] = 0
        else:
            log.warning('Option not valid: {}'.format(
                pArgs.interIntraHandling))

    return hic_matrix
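
# Usage sketch: adjustMatrix is normally driven by hicAdjustMatrix's argument
# parser; a hypothetical programmatic call could look like this (the file
# name and namespace fields are assumptions, not the tool's documented API).
# from argparse import Namespace
# hic = adjustMatrix(Namespace(matrix='matrix.cool', chromosomes=['chr1'],
#                              regions=None, maskBadRegions=None,
#                              action='keep', interIntraHandling=None))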
Example #4
# Select the stage to analyse: each block below overrides the previous one,
# so only the last assignments take effect. Earlier alternatives are kept
# commented out.
# stage = 'Z_50kb'

# coolfile = 'P_50kb.cool'
# transexpfile = 'P_50kb_trans_expected.tsv'
# stage = 'P_50kb'

# coolfile = 'D_50kb.cool'
# transexpfile = 'D_50kb_trans_expected.tsv'
# stage = 'D_50kb'

coolfile = 'MII_50kb.cool'
transexpfile = 'MII_50kb_trans_expected.tsv'
stage = 'MII_50kb'

pileupshape = 500
c = cooler.Cooler(coolfile)

#generate chrom pairs for all autosomes
chroms = c.chromnames
chroms.remove('chrX')
chroms.remove('chrM')
chroms.remove('chrY')

chrompairs = list(itertools.combinations(chroms, 2))

##############################################################################################
#make trans obs pileup
pileup = np.zeros((pileupshape, pileupshape))
n = 0
for pair in chrompairs:
    t1 = c.matrix().fetch(pair[0], pair[1])
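    # The source snippet ends here. A minimal, hypothetical continuation
    # could accumulate each trans matrix into the fixed-size pileup, e.g.
    # by cropping to pileupshape x pileupshape (the original may instead
    # have sampled random windows of that size):
    if t1.shape[0] >= pileupshape and t1.shape[1] >= pileupshape:
        pileup += np.nan_to_num(t1[:pileupshape, :pileupshape])
        n += 1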
Example #5
def get_data(f,
             start_pos_1,
             end_pos_1,
             start_pos_2,
             end_pos_2,
             transform='default'):
    """Get balanced pixel data.
 
    Args:
        f: h5py.File
            An HDF5 Group that contains the cooler for this resolution
        start_pos_1 (int): Test.
        end_pos_1 (int): Test.
        start_pos_2 (int): Test.
        end_pos_2 (int): Test.
 
    Returns:
        DataFrame: Annotated cooler pixels.
    """

    c = cooler.Cooler(f)

    (chroms, chrom_sizes,
     chrom_cum_lengths) = get_chromosome_names_cumul_lengths(c)

    i0 = abs_coord_2_bin(c, start_pos_1, chroms, chrom_cum_lengths,
                         chrom_sizes)
    i1 = abs_coord_2_bin(c, end_pos_1, chroms, chrom_cum_lengths, chrom_sizes)

    j0 = abs_coord_2_bin(c, start_pos_2, chroms, chrom_cum_lengths,
                         chrom_sizes)
    j1 = abs_coord_2_bin(c, end_pos_2, chroms, chrom_cum_lengths, chrom_sizes)
    matrix = c.matrix(as_pixels=True, balance=False, max_chunk=np.inf)

    if i0 >= matrix.shape[0] or j0 >= matrix.shape[1]:
        # query beyond the bounds of the matrix
        # return an empty matrix
        i0, i1, j0, j1 = 0, 0, 0, 0
    else:
        # limit the range of the query to be within bounds
        i1 = min(i1, matrix.shape[0] - 1)
        j1 = min(j1, matrix.shape[1] - 1)

    #print("size", matrix.shape)

    pixels = matrix[i0:i1 + 1, j0:j1 + 1]

    if not len(pixels):
        return pd.DataFrame(
            columns=['genome_start1', 'genome_start2', 'balanced'])

    # select bin columns to extract
    cols = ['chrom', 'start', 'end']
    if (transform == 'default'
            and 'weight' in c.bins()) or transform == 'weight':
        cols.append('weight')
    elif transform in ('KR', 'VC', 'VC_SQRT'):
        cols.append(transform)

    bins = c.bins(convert_enum=False)[cols]
    pixels = cooler.annotate(pixels, bins)
    pixels['genome_start1'] = chrom_cum_lengths[
        pixels['chrom1']] + pixels['start1']
    pixels['genome_start2'] = chrom_cum_lengths[
        pixels['chrom2']] + pixels['start2']

    # apply transform
    if (transform == 'default'
            and 'weight' in c.bins()) or transform == 'weight':
        pixels['balanced'] = (pixels['count'] * pixels['weight1'] *
                              pixels['weight2'])
        return pixels[['genome_start1', 'genome_start2', 'balanced']]
    elif transform in ('KR', 'VC', 'VC_SQRT'):
        pixels['balanced'] = (pixels['count'] / pixels[transform + '1'] /
                              pixels[transform + '2'])
        return pixels[['genome_start1', 'genome_start2', 'balanced']]
    else:
        return pixels[['genome_start1', 'genome_start2', 'count']]
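
# Usage sketch (hypothetical cooler URI and coordinates): fetch balanced
# pixels for a small square query window near the genome start.
# df = get_data('hg19.1000kb.cool', 0, 5000000, 0, 5000000,
#               transform='default')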
Example #6
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("coolfile",
                        type=str,
                        help="Cooler file with your Hi-C data")
    parser.add_argument("baselist",
                        type=str,
                        help="""A 3-column bed file or a 6-column double-bed
                        file (i.e. chr1,start1,end1,chr2,start2,end2).
                        Should be tab-delimited.
                        With a bed file, will consider all cis combinations
                        of intervals. To pileup features along the diagonal
                        instead, use the --local argument.
                        Can be piped in via stdin, then use "-".""")
    ##### Extra arguments
    parser.add_argument("--pad",
                        default=100,
                        type=int,
                        required=False,
                        help="""Padding of the windows around the centres of
                        specified features (i.e. final size of the matrix is
                        2×pad+res), in kb.
                        Ignored with --rescale, use --rescale_pad instead.""")
    ### Control of controls
    parser.add_argument("--minshift",
                        default=10**5,
                        type=int,
                        required=False,
                        help="""Shortest distance for randomly shifting
                        coordinates when creating controls""")
    parser.add_argument("--maxshift",
                        default=10**6,
                        type=int,
                        required=False,
                        help="""Longest distance for randomly shifting
                        coordinates when creating controls""")
    parser.add_argument("--nshifts",
                        default=10,
                        type=int,
                        required=False,
                        help="""Number of control regions per averaged
                        window""")
    parser.add_argument("--expected",
                        default=None,
                        type=str,
                        required=False,
                        help="""File with expected (output of
                        cooltools compute-expected). If None, don't use expected
                        and use randomly shifted controls""")
    ### Filtering
    parser.add_argument("--mindist",
                        type=int,
                        required=False,
                        help="""Minimal distance of intersections to use. If
                        not specified, uses --pad as mindist""")
    parser.add_argument("--maxdist",
                        type=int,
                        required=False,
                        help="""Maximal distance of intersections to use""")
    parser.add_argument("--minsize",
                        type=int,
                        required=False,
                        help="""Minimal length of features to use for local
                        analysis""")
    parser.add_argument("--maxsize",
                        type=int,
                        required=False,
                        help="""Maximal length of features to use for local
                        analysis""")
    parser.add_argument("--excl_chrs",
                        default='chrY,chrM',
                        type=str,
                        required=False,
                        help="""Exclude these chromosomes from analysis""")
    parser.add_argument("--incl_chrs",
                        default='all',
                        type=str,
                        required=False,
                        help="""Include these chromosomes; default is all.
                        excl_chrs overrides this.""")
    parser.add_argument("--subset",
                        default=0,
                        type=int,
                        required=False,
                        help="""Take a random sample of the bed file - useful
                        for files with too many features to run as is, i.e.
                        some repetitive elements. Set to 0 or lower to keep all
                        data.""")
    ### Modes of action
    parser.add_argument("--anchor",
                        default=None,
                        type=str,
                        required=False,
                        help="""A UCSC-style coordinate to use as an anchor to
                        create intersections with coordinates in the baselist
                        """)
    parser.add_argument("--by_window",
                        action='store_true',
                        default=False,
                        required=False,
                        help="""Create a pile-up for each coordinate in the
                        baselist. Will save a master-table with coordinates,
                        their enrichments and cornerCV, which is reflective of
                        noisiness""")
    parser.add_argument("--save_all",
                        action='store_true',
                        default=False,
                        required=False,
                        help="""If by-window, save all individual pile-ups in a
                        separate json file""")
    parser.add_argument("--local",
                        action='store_true',
                        default=False,
                        required=False,
                        help="""Create local pileups, i.e. along the
                        diagonal""")
    parser.add_argument("--unbalanced",
                        action='store_true',
                        required=False,
                        help="""Do not use balanced data.
                        Useful for single-cell Hi-C data together with
                        --coverage_norm, not recommended otherwise.""")
    parser.add_argument("--coverage_norm",
                        action='store_true',
                        required=False,
                        help="""If --unbalanced, also add coverage
                        normalization based on chromosome marginals""")
    ### Rescaling
    parser.add_argument("--rescale",
                        action='store_true',
                        default=False,
                        required=False,
                        help="""Do not use centres of features and pad, and
                        rather use the actual feature sizes and rescale
                        pileups to the same shape and size""")
    parser.add_argument("--rescale_pad",
                        default=1.0,
                        required=False,
                        type=float,
                        help="""If --rescale, padding in fraction of feature
                        length""")
    parser.add_argument("--rescale_size",
                        type=int,
                        default=99,
                        required=False,
                        help="""If --rescale, this is used to determine the
                        final size of the pileup, i.e. it will be size×size. Due
                        to technical limitation in the current implementation,
                        has to be an odd number""")

    parser.add_argument("--n_proc",
                        default=1,
                        type=int,
                        required=False,
                        help="""Number of processes to use. Each process works
                        on a separate chromosome, so might require quite a bit
                        more memory, although the data are always stored as
                        sparse matrices""")
    ### Output
    parser.add_argument("--outdir",
                        default='.',
                        type=str,
                        required=False,
                        help="""Directory to save the data in""")
    parser.add_argument("--outname",
                        default='auto',
                        type=str,
                        required=False,
                        help="""Name of the output file. If not set, is
                        generated automatically to include important
                        information.""")
    parser.add_argument(
        "-l",
        "--log",
        dest="logLevel",
        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
        default='INFO',
        help="Set the logging level.")
    args = parser.parse_args()

    logging.basicConfig(level=getattr(logging, args.logLevel))

    logging.info(args)
    if args.n_proc == 0:
        nproc = -1
    else:
        nproc = args.n_proc

    c = cooler.Cooler(args.coolfile)

    if not os.path.isfile(args.baselist) and args.baselist != '-':
        raise FileNotFoundError("Loop(base) coordinate file doesn't exist")

    coolname = args.coolfile.split('::')[0].split('/')[-1].split('.')[0]
    if args.baselist != '-':
        bedname = args.baselist.split('/')[-1].split('.bed')[0].split(
            '_mm9')[0].split('_mm10')[0]
    else:
        bedname = 'stdin'
        import sys
        args.baselist = sys.stdin
    if args.expected is not None:
        if args.nshifts > 0:
            logging.warning('With specified expected will not use controls')
            args.nshifts = 0
        if not os.path.isfile(args.expected):
            raise FileNotFoundError("Expected file doesn't exist")
        expected = pd.read_csv(args.expected, sep='\t', header=0)
    else:
        expected = False

    pad = args.pad * 1000 // c.binsize

    if args.mindist is None:
        mindist = pad * c.binsize
    else:
        mindist = args.mindist

    if args.maxdist is None:
        maxdist = np.inf
    else:
        maxdist = args.maxdist

    if args.incl_chrs == 'all':
        incl_chrs = c.chromnames
    else:
        incl_chrs = args.incl_chrs.split(',')

    if args.by_window and args.rescale:
        raise NotImplementedError(
            "Rescaling with by-window pileups is not supported")

    if args.rescale and args.rescale_size % 2 == 0:
        raise ValueError("Please provide an odd rescale_size")

    if args.anchor is not None:
        if '_' in args.anchor:
            anchor, anchor_name = args.anchor.split('_')
            anchor = cooler.util.parse_region_string(anchor)
        else:
            anchor = cooler.util.parse_region_string(args.anchor)
            anchor_name = args.anchor
    else:
        anchor = None

    if anchor:
        fchroms = [anchor[0]]
    else:
        chroms = c.chromnames
        fchroms = []
        for chrom in chroms:
            if chrom not in args.excl_chrs.split(',') and chrom in incl_chrs:
                fchroms.append(chrom)

    bases = pd.read_csv(
        args.baselist,
        sep='\t',
        names=['chr1', 'start1', 'end1', 'chr2', 'start2', 'end2'],
        index_col=False)
    if np.all(pd.isnull(bases[['chr2', 'start2', 'end2']].values)):
        bases = bases[['chr1', 'start1', 'end1']]
        bases.columns = ['chr', 'start', 'end']
        if not np.all(bases['end'] >= bases['start']):
            raise ValueError('Some ends in the file are smaller than starts')
        if args.local:
            if args.minsize is None:
                args.minsize = 0
            if args.maxsize is None:
                args.maxsize = np.inf
            length = bases['end'] - bases['start']
            bases = bases[(length >= args.minsize) & (length <= args.maxsize)]
        combinations = True
        basechroms = set(bases['chr'])
    else:
        if not np.all(bases['chr1'] == bases['chr2']):
            logging.warning(
                "Found inter-chromosomal loci pairs, discarding them")
            bases = bases[bases['chr1'] == bases['chr2']]
        if anchor:
            raise ValueError(
                "Can't use anchor with both sides of loops defined")
        elif args.local:
            raise ValueError(
                "Can't make local with both sides of loops defined")


        combinations = False
        basechroms = set(bases['chr1']) | set(bases['chr2'])

    fchroms = natsorted(list(set(fchroms) & basechroms))

    if len(fchroms) == 0:
        raise ValueError("""No chromosomes are in common between the coordinate
                         file/anchor and the cooler file. Are they in the same
                         format, e.g. starting with "chr"?""")

    mids = get_mids(bases, resolution=c.binsize, combinations=combinations)
    if args.subset > 0 and args.subset < len(mids):
        mids = mids.sample(args.subset)

    if args.outdir == '.':
        args.outdir = os.getcwd()

    if args.outname == 'auto':
        outname = '%s-%sK_over_%s' % (coolname, c.binsize / 1000, bedname)
        if args.nshifts > 0:
            outname += '_%s-shifts' % args.nshifts
        if args.expected is not None:
            outname += '_expected'
        if args.nshifts <= 0 and args.expected is None:
            outname += '_noNorm'
        if anchor:
            outname += '_from_%s' % anchor_name
        if args.local:
            outname += '_local'
            if args.minsize > 0 or args.maxsize < np.inf:
                outname += '_len_%s-%s' % (args.minsize, args.maxsize)
        elif args.mindist is not None or args.maxdist is not None:
            outname += '_dist_%s-%s' % (mindist, maxdist)
        if args.rescale:
            outname += '_rescaled'
        if args.unbalanced:
            outname += '_unbalanced'
        if args.coverage_norm:
            outname += '_covnorm'
        if args.subset > 0:
            outname += '_subset-%s' % args.subset
        if args.by_window:
            outname = 'Enrichment_%s.txt' % outname
        else:
            outname += '.np.txt'
    else:
        outname = args.outname

    if args.by_window:
        if not combinations:
            raise ValueError(
                "Can't make by-window pileups without making combinations")
        if args.local:
            raise ValueError("Can't make local by-window pileups")
        if anchor:
            raise ValueError(
                "Can't make by-window combinations with an anchor")
        if args.coverage_norm:
            raise NotImplementedError(
                "Can't make by-window combinations with coverage "
                "normalization - please use balanced data instead")
        if args.outname != 'auto':
            logging.warning("Always using autonaming for by-window pileups")

        finloops = pileupsByWindowWithControl(mids=mids,
                                              filename=args.coolfile,
                                              pad=pad,
                                              nproc=nproc,
                                              chroms=fchroms,
                                              minshift=args.minshift,
                                              maxshift=args.maxshift,
                                              nshifts=args.nshifts,
                                              expected=expected,
                                              mindist=mindist,
                                              maxdist=maxdist,
                                              unbalanced=args.unbalanced,
                                              cov_norm=args.coverage_norm,
                                              rescale=args.rescale,
                                              rescale_pad=args.rescale_pad,
                                              rescale_size=args.rescale_size)
        if args.save_all:
            outdict = {
                '%s:%s-%s' % key: (val[0], val[1].tolist())
                for key, val in finloops.items()
            }
            import json
            with open(os.path.join(args.outdir, outname)[:-4] + '.json',
                      'w') as fp:
                json.dump(outdict, fp)  #, sort_keys=True, indent=4)

        p = Pool(nproc)
        data = p.map(prepare_single, finloops.items())
        p.close()
        data = pd.DataFrame(data,
                            columns=[
                                'chr', 'start', 'end', 'N', 'Enrichment1',
                                'Enrichment3', 'CV3', 'CV5'
                            ])
        data = data.reindex(index=order_by_index(
            data.index, index_natsorted(zip(data['chr'], data['start']))))
        try:
            data.to_csv(os.path.join(args.outdir, outname),
                        sep='\t',
                        index=False)
        except FileNotFoundError:
            os.mkdir(args.outdir)
            data.to_csv(os.path.join(args.outdir, outname),
                        sep='\t',
                        index=False)
    else:
        loop = pileupsWithControl(mids=mids,
                                  filename=args.coolfile,
                                  pad=pad,
                                  nproc=nproc,
                                  chroms=fchroms,
                                  local=args.local,
                                  minshift=args.minshift,
                                  maxshift=args.maxshift,
                                  nshifts=args.nshifts,
                                  expected=expected,
                                  mindist=mindist,
                                  maxdist=maxdist,
                                  combinations=combinations,
                                  anchor=anchor,
                                  unbalanced=args.unbalanced,
                                  cov_norm=args.coverage_norm,
                                  rescale=args.rescale,
                                  rescale_pad=args.rescale_pad,
                                  rescale_size=args.rescale_size)
        try:
            np.savetxt(os.path.join(args.outdir, outname), loop)
        except FileNotFoundError:
            try:
                os.mkdir(args.outdir)
            except FileExistsError:
                pass
            np.savetxt(os.path.join(args.outdir, outname), loop)
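
# Typical command-line invocation of this script (hypothetical file names):
#   python coolpup.py data.cool loops.bedpe --pad 100 --nshifts 10 --n_proc 4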
Example #7
def test_align_track_with_cooler(request, tmpdir):

    clr_file = op.join(request.fspath.dirname, "data/sin_eigs_mat.cool")
    clr = cooler.Cooler(clr_file)

    # valid track with three entries that can all be aligned
    track = pd.DataFrame(
        [
            ["chr1", 990, 995, 22],
            ["chr2", 20, 30, -1],
            ["chr3", 0, 10, 0.1],
        ],
        columns=["chrom", "start", "end", "value"],
    )
    assert (~cooltools.lib.align_track_with_cooler(
        track, clr)["value"].isna()).sum() == 3

    # not a track, is not sorted
    track = pd.DataFrame(
        [["chr3", 0, 10, 0.1], ["chr2", 20, 30, -1], ["chr2", 0, 10, 21]],
        columns=["chrom", "start", "end", "value"],
    )
    with pytest.raises(ValueError):
        cooltools.lib.align_track_with_cooler(track, clr)

    # not a track, is overlapping
    track = pd.DataFrame(
        [
            ["chr1", 990, 1000, 22],
            ["chr2", 5, 15, 0.1],
            ["chr2", 20, 30, -1],
        ],
        columns=["chrom", "start", "end", "value"],
    )
    with pytest.raises(ValueError):
        cooltools.lib.align_track_with_cooler(track, clr)

    # bin size mismatch
    track = pd.DataFrame(
        [["chr1", 990, 995, 22], ["chr2", 20, 25, -1], ["chr3", 0, 5, 0.1]],
        columns=["chrom", "start", "end", "value"],
    )
    with pytest.raises(ValueError):
        cooltools.lib.align_track_with_cooler(track, clr)

    # clr_weight_name mismatch
    track = pd.DataFrame(
        [
            ["chr1", 990, 995, 22],
            ["chr2", 20, 30, -1],
            ["chr3", 0, 10, 0.1],
        ],
        columns=["chrom", "start", "end", "value"],
    )
    with pytest.raises(ValueError):
        cooltools.lib.align_track_with_cooler(
            track, clr, clr_weight_name="invalid_weight_name")

    # regions with no assigned values
    track = pd.DataFrame(
        [["chr1", 0, 10, 0.1], ["chr1", 20, 30, -1], ["chr1", 990, 995, 22]],
        columns=["chrom", "start", "end", "value"],
    )
    with pytest.raises(ValueError):
        cooltools.lib.align_track_with_cooler(track, clr)

    # using a restricted view that only considers chr1 avoids the ValueError from regions with no assigned values
    view_df = cooltools.lib.make_cooler_view(clr)
    assert (~cooltools.lib.align_track_with_cooler(
        track, clr, view_df=view_df[:1])["value"].isna()).sum() == 3

    # testing mask_bad_bins option
    clr_file = op.join(request.fspath.dirname, "data/CN.mm9.1000kb.cool")
    clr = cooler.Cooler(clr_file)
    view_df = cooltools.lib.make_cooler_view(clr)[:1]

    track = pd.DataFrame(
        [["chr1", 0, 1000000, 1], ["chr1", 3000000, 4000000, 10]],
        columns=["chrom", "start", "end", "value"],
    )
    # without masking, both get assigned
    assert (cooltools.lib.align_track_with_cooler(
        track, clr, view_df=view_df, mask_bad_bins=False)["value"].sum() == 11)

    # with masking, only the second value from the track gets assigned
    assert (cooltools.lib.align_track_with_cooler(
        track, clr, view_df=view_df, mask_bad_bins=True)["value"].sum() == 10)
Example #8
def call_dots(
        cool_path,
        expected_path,
        expected_name,
        nproc,
        max_loci_separation,
        max_nans_tolerated,
        tile_size,
        fdr,
        dots_clustering_radius,
        verbose,
        output_scores,
        output_calls):
    """
    Call dots on a Hi-C heatmap that are not larger than max_loci_separation.
    
    COOL_PATH : The paths to a .cool file with a balanced Hi-C map.

    EXPECTED_PATH : The paths to a tsv-like file with expected signal.

    Analysis will be performed for chromosomes referred to in EXPECTED_PATH, and
    therefore these chromosomes must be a subset of chromosomes referred to in
    COOL_PATH. Also chromosomes refered to in EXPECTED_PATH must be non-trivial,
    i.e., contain not-NaN signal. Thus, make sure to prune your EXPECTED_PATH
    before applying this script.

    COOL_PATH and EXPECTED_PATH must be binned at the same resolution.

    EXPECTED_PATH must contain at least the following columns for cis contacts:
    'chrom', 'diag', 'n_valid', value_name. value_name is controlled using
    options. Header must be present in a file.

    """
    clr = cooler.Cooler(cool_path)

    # read expected and make preparations for validation,
    # that's what we expect as column names:
    expected_columns = ['chrom', 'diag', 'n_valid', expected_name]
    # what would become a MultiIndex:
    expected_index = ['chrom', 'diag']
    # expected dtype as a rudimentary form of validation:
    expected_dtype = {
        'chrom': str,  # np.str was removed in NumPy 1.24
        'diag': np.int64,
        'n_valid': np.int64,
        expected_name: np.float64
    }
    # unique list of chroms mentioned in expected_path:
    get_exp_chroms = lambda df: df.index.get_level_values("chrom").unique()
    # compute # of bins by comparing matching indexes:
    get_exp_bins = lambda df, ref_chroms: (
        df.index.get_level_values("chrom").isin(ref_chroms).sum())
    # use 'usecols' as a rudimentary form of validation,
    # and dtype. Keep 'comment' and 'verbose' - explicit,
    # as we may use them later:
    expected = pd.read_table(
        expected_path,
        usecols=expected_columns,
        index_col=expected_index,
        dtype=expected_dtype,
        comment=None,
        verbose=verbose)

    #############################################
    # CROSS-VALIDATE COOLER and EXPECTED:
    #############################################
    # EXPECTED vs COOLER:
    # chromosomes to deal with 
    # are by default extracted
    # from the expected-file:
    expected_chroms = get_exp_chroms(expected)
    # do simple column-name validation for now:
    if not set(expected_chroms).issubset(clr.chromnames):
        raise ValueError(
            "Chromosomes in {} must be subset of ".format(expected_path) +
            "chromosomes in cooler {}".format(cool_path))
    # check number of bins:
    expected_bins = get_exp_bins(expected, expected_chroms)
    cool_bins   = clr.bins()[:]["chrom"].isin(expected_chroms).sum()
    if not (expected_bins == cool_bins):
        raise ValueError(
            "Number of bins is not matching:",
            " {} in {}, and {} in {} for chromosomes {}".format(expected_bins,
                                                                expected_path,
                                                                cool_bins,
                                                                cool_path,
                                                                expected_chroms))
    if verbose:
        print("{} and {} passed cross-compatibility checks.".format(
            cool_path, expected_path))

    # prepare some parameters:
    # turn them from nucleotides dims to bins, etc.:
    binsize = clr.binsize
    loci_separation_bins = int(max_loci_separation/binsize)
    tile_size_bins = int(tile_size/binsize)
    # # clustering would deal with bases-units for now, so
    # # supress this for now:
    # clustering_radius_bins = int(dots_clustering_radius/binsize)
    
    ktypes = ['donut', 'vertical', 'horizontal', 'lowleft', 'upright']

    # define kernel parameteres based on the cooler resolution:
    if binsize > 28000:
        # > 28 kb - is probably too much ...
        raise ValueError(
            "Provided cooler {} has resolution {} bases, "
            "which is too low for analysis.".format(cool_path, binsize))
    elif binsize >= 18000:
        # ~ 20-25 kb:
        w, p = 3, 1
    elif binsize >= 8000:
        # ~ 10 kb
        w, p = 5, 2
    elif binsize >= 4000:
        # ~5 kb
        w, p = 7, 4
    else:
        # < 5 kb - is probably too fine ...
        raise ValueError(
            "Provided cooler {} has resolution {} bases, "
            "which is too fine for analysis.".format(cool_path, binsize))
    # rename w, p to wid, pix probably, or _w, _p to avoid naming conflicts ...
    if verbose:
        print("Kernels parameters are set as w,p={},{}"
              " for the cooler with {} bp resolution.".format(w,p,binsize))

    kernels = {k: get_kernel(w,p,k) for k in ktypes}

    tiles = list(
        dotfinder.heatmap_tiles_generator_diag(
            clr, 
            expected_chroms, 
            w, 
            tile_size_bins, 
            loci_separation_bins
        )
    )

    scoring_step(clr, expected, expected_name, tiles, kernels, 
                 max_nans_tolerated, loci_separation_bins, output_scores, 
                 nproc, verbose)

    if verbose:
        print("Subsequent clustering and thresholding steps are not production-ready")

    if False:
        # 'scores_file' was undefined in the original; presumably the scores
        # written by scoring_step (output_scores) were meant here.
        centroids = clustering_step(output_scores, expected_chroms, ktypes, fdr,
                                    dots_clustering_radius, verbose)
        ###################################
        # everything works up until here ...
        ###################################
        thresholding_step(centroids, output_calls)
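
# Usage sketch (hypothetical paths and parameter values, mirroring the CLI
# this function backs):
# call_dots('hic.10kb.cool', 'expected.cis.tsv', 'balanced.avg', nproc=4,
#           max_loci_separation=2000000, max_nans_tolerated=1,
#           tile_size=6000000, fdr=0.02, dots_clustering_radius=39000,
#           verbose=True, output_scores='scores.hdf', output_calls='dots.tsv')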
Example #9
def main(args):

    import joblib  # sklearn.externals.joblib was removed in scikit-learn 0.23
    import gc, pathlib
    import numpy as np
    from peakachu import trainUtils, utils

    np.seterr(divide='ignore',invalid='ignore')
    
    pathlib.Path(args.output).mkdir(parents=True, exist_ok=True)

    hic_info = utils.read_hic_header(args.path) # more robust to check if a file is .hic

    # a non-None header means the input is a .hic file
    hic = hic_info is not None

    coords = trainUtils.parsebed(args.bedpe,lower=2,res=args.resolution)
    kde, lower, long_start, long_end = trainUtils.learn_distri_kde(coords)

    if not hic:
        import cooler
        Lib = cooler.Cooler(args.path)
        chromosomes = Lib.chromnames[:]
    else:
        chromosomes = list(hic_info['chromsizes'])

    # train model per chromosome
    positive_class = {}
    negative_class = {}
    for key in chromosomes:
        if key.startswith('chr'):
            chromname=key
        else:
            chromname='chr'+key
        print('collecting from {}'.format(key))
        if not hic:
            X = Lib.matrix(balance=args.balance,sparse=True).fetch(key).tocsr()
        else:
            if args.balance:
                X = utils.csr_contact_matrix('KR',args.path,key,key,'BP',args.resolution)
            else:
                X = utils.csr_contact_matrix('NONE',args.path,key,key,'BP',args.resolution)
        clist = coords[chromname]

        try:
            # np.vstack no longer accepts generators, so materialize lists
            positive_class[chromname] = np.vstack(
                [f for f in trainUtils.buildmatrix(X, clist, width=args.width)])
            neg_coords = trainUtils.negative_generating(
                X, kde, clist, lower, long_start, long_end)
            stop = len(clist)
            negative_class[chromname] = np.vstack(
                [f for f in trainUtils.buildmatrix(X, neg_coords, width=args.width,
                                                   positive=False, stop=stop)])
        except Exception:
            print(chromname, ' failed to gather features')

    for key in chromosomes:
        if key.startswith('chr'):
            chromname=key
        else:
            chromname='chr'+key
 
        Xtrain = np.vstack([v for k, v in positive_class.items() if k != chromname])
        Xfake = np.vstack([v for k, v in negative_class.items() if k != chromname])
        print(chromname,'pos/neg: ',Xtrain.shape[0],Xfake.shape[0])
        model = trainUtils.trainRF(Xtrain,Xfake)

        joblib.dump(model, args.output+'/'+chromname+'.pkl', compress=('xz',3))
Example #10
def main(args=None):

    args = parse_arguments().parse_args(args)
    short_v_long_range = []
    sum_smaller = []
    sum_greater = []
    for matrix in args.matrices:

        is_cooler = check_cooler(matrix)
        if not is_cooler:
            hic_matrix = hm.hiCMatrix(matrix)
        else:
            hic_matrix = matrix
        if args.chromosomes is None:
            # get all chromosomes from cooler file
            if not is_cooler:
                chromosomes_list = list(hic_matrix.chrBinBoundaries)
            else:
                chromosomes_list = cooler.Cooler(matrix).chromnames
        else:
            chromosomes_list = args.chromosomes

        short_v_long_range_matrix_threads = [None] * args.threads
        sum_smaller_threads = [None] * args.threads
        sum_greater_threads = [None] * args.threads

        chromosomesListPerThread = len(chromosomes_list) // args.threads
        all_data_collected = False
        queue = [None] * args.threads
        process = [None] * args.threads
        thread_done = [False] * args.threads
        for i in range(args.threads):

            if i < args.threads - 1:
                chromosomeListThread = chromosomes_list[i * chromosomesListPerThread:(i + 1) * chromosomesListPerThread]
            else:
                chromosomeListThread = chromosomes_list[i * chromosomesListPerThread:]

            queue[i] = Queue()
            process[i] = Process(target=compute_relation_short_long_range, kwargs=dict(
                pHiCMatrix=hic_matrix,
                pChromosomes=chromosomeListThread,
                pDistance=args.distance,
                pIsCooler=is_cooler,
                pQueue=queue[i]
            )
            )

            process[i].start()

        while not all_data_collected:
            for i in range(args.threads):
                if queue[i] is not None and not queue[i].empty():
                    short_v_long_range_matrix_threads[i], sum_smaller_threads[i], sum_greater_threads[i] = queue[i].get()
                    queue[i] = None
                    process[i].join()
                    process[i].terminate()
                    process[i] = None
                    thread_done[i] = True
            all_data_collected = True
            for thread in thread_done:
                if not thread:
                    all_data_collected = False
            time.sleep(1)

        short_v_long_range_matrix = [item for sublist in short_v_long_range_matrix_threads for item in sublist]
        sum_smaller_matrix = [item for sublist in sum_smaller_threads for item in sublist]
        sum_greater_matrix = [item for sublist in sum_greater_threads for item in sublist]

        short_v_long_range.append(short_v_long_range_matrix)
        sum_smaller.append(sum_smaller_matrix)
        sum_greater.append(sum_greater_matrix)

    log.debug(short_v_long_range)
    plt.ylabel('Sum short range / long range')
    plt.tick_params(
        axis='x',
        which='both',
        bottom=False,
        top=False,
        labelbottom=False)

    box_plot = plt.boxplot(short_v_long_range, patch_artist=True)
    legend_handles_color = []
    for i, patch in enumerate(box_plot['boxes']):
        patch.set_facecolor(args.colorList[i % len(args.colorList)])
        legend_handles_color.append(mpatches.Patch(color=args.colorList[i % len(args.colorList)], label=args.matrices[i].split('/')[-1]))
    plt.legend(handles=legend_handles_color)
    plt.xlabel('Boxplot shows svl-ratio per chromosome.')
    plt.savefig(args.plotFileName, dpi=args.dpi)

    if len(args.matrices) > 1:
        p_values = []
        for i, sample in enumerate(short_v_long_range):
            for sample2 in short_v_long_range[i + 1:]:
                statistic, significance_level = ranksums(sample, sample2)
                p_values.append(significance_level)
        log.debug('p_values {}'.format(p_values))
        with open(args.outFileName, 'w') as file:
            header = '# Created with HiCExplorer\'s hicPlotSVL ' + __version__ + '\n'
            header += "# Short range vs long range contacts per chromosome, p-values of each distribution against each other distribution with Wilcoxon rank-sum\n"
            header += '# Short range contacts: <= ' + str(args.distance) + '\n'
            file.write(header)
            counter = 0
            for i, matrix_0 in enumerate(args.matrices):
                for j, matrix_1 in enumerate(args.matrices[i + 1:]):
                    file.write(matrix_0 + '\t' + matrix_1 + '\t' + str(p_values[counter]) + '\n')
                    counter += 1

    with open(args.outFileNameData, 'w') as file:
        header = '# Created with HiCExplorer\'s hicPlotSVL ' + __version__ + '\n'
        header += "# Short range vs long range contacts per chromosome: raw data\n"
        header += '# Short range contacts: <= ' + str(args.distance) + '\n'
        matrices_names = '\t\t\t'.join(args.matrices)
        header += '#\t{}\n'.format(matrices_names)
        header += '# Chromosome\t'
        header += '\t'.join(['Ratio', 'Sum <= {}'.format(args.distance), 'Sum > {}'.format(args.distance)] * len(args.matrices))
        header += '\n'
        file.write(header)
        counter = 0
        for i, chromosome in enumerate(chromosomes_list):
            file.write('{}\t'.format(chromosome))
            for j, matrix in enumerate(args.matrices):
                if i < len(short_v_long_range[j]):
                    file.write('{}\t{}\t{}\t'.format(short_v_long_range[j][i], sum_smaller[j][i], sum_greater[j][i]))
                else:
                    file.write('\t')

            file.write('\n')
Example #11
def test_is_valid_expected(request, tmpdir):

    expected_file = op.join(request.fspath.dirname,
                            "data/CN.mm9.toy_expected.tsv")
    expected_df = pd.read_csv(expected_file, sep="\t")

    # False, because we need to specify that this expected only has balanced.avg, not count.avg
    assert cooltools.lib.checks.is_valid_expected(expected_df, "cis") is False

    # true, because passing expected_value_cols that match what is in the expected table
    assert cooltools.lib.checks.is_valid_expected(
        expected_df, "cis", expected_value_cols=["balanced.avg"])

    expected_df_incompat = expected_df.copy()
    expected_df_incompat.drop("dist", axis=1, inplace=True)
    # false, because this is cis expected and there is no dist column
    with pytest.raises(ValueError):
        cooltools.lib.is_valid_expected(
            expected_df_incompat,
            "cis",
            verify_view=None,
            expected_value_cols=["balanced.avg"],
            raise_errors=True,
        )

    # raises a ValueError because of non-unique region pairs
    with pytest.raises(ValueError):
        cooltools.lib.is_valid_expected(
            expected_df_incompat,
            "trans",
            verify_view=None,
            expected_value_cols=["balanced.avg"],
            raise_errors=True,
        )

    # raises a ValueError because the contact type is not cis or trans
    with pytest.raises(ValueError):
        cooltools.lib.is_valid_expected(
            expected_df,
            "other",
            verify_view=None,
            expected_value_cols=["balanced.avg"],
            raise_errors=True,
        )

    # raises a ValueError because the input is not a dataframe
    with pytest.raises(ValueError):
        cooltools.lib.is_valid_expected(
            expected_df.values,
            "other",
            verify_view=None,
            expected_value_cols=["balanced.avg"],
            raise_errors=True,
        )

    # raise error w/ old column names
    expected_df_incompat = expected_df.copy()
    expected_df_incompat.rename(columns={"region1": "region"}, inplace=True)
    with pytest.raises(ValueError):
        cooltools.lib.is_valid_expected(
            expected_df_incompat,
            "cis",
            verify_view=None,
            expected_value_cols=["balanced.avg"],
            raise_errors=True,
        )

    # alternate method of loading:
    expected_df = cooltools.lib.read_expected_from_file(
        expected_file, expected_value_cols=["balanced.avg"])

    ### testing expected compatibility with a view as well
    view_file = op.join(request.fspath.dirname, "data/CN.mm9.toy_regions.bed")
    view_df = cooltools.lib.read_viewframe_from_file(view_file)

    # true, because this view has regions named foo and bar, just like the expected table
    assert cooltools.lib.is_valid_expected(
        expected_df,
        "cis",
        verify_view=view_df,
        expected_value_cols=["balanced.avg"])

    view_df_incompatible = view_df.copy()
    view_df_incompatible["name"] = ["totally", "wrong"]
    # false, because of mismatching view region names
    with pytest.raises(ValueError):
        cooltools.lib.is_valid_expected(
            expected_df,
            "cis",
            verify_view=view_df_incompatible,
            expected_value_cols=["balanced.avg"],
            raise_errors=True,
        )

    # tests with sin_eigs_mat cooler
    cooler_file = op.join(request.fspath.dirname, "data/sin_eigs_mat.cool")
    clr = cooler.Cooler(cooler_file)
    exp_cis = cooltools.expected_cis(clr)

    # cis with no verify_view should work!
    assert cooltools.lib.is_valid_expected(exp_cis,
                                           "cis",
                                           verify_view=None,
                                           verify_cooler=clr,
                                           raise_errors=True)
    # tests with sin_eigs_mat cooler and custom armwise view as input
    view_df = pd.DataFrame(
        [
            ["chr1", 0, 500, "chr1L"],
            ["chr1", 500, 1000, "chr1R"],
            ["chr2", 0, 1000, "chr2L"],
            ["chr2", 1000, 2000, "chr2R"],
            ["chr3", 0, 1500, "chr3L"],
            ["chr3", 1500, 3000, "chr3R"],
        ],
        columns=["chrom", "start", "end", "name"],
    )

    exp_cis = cooltools.expected_cis(clr, view_df=view_df[:1])

    # cis with intra_only=True does not raise ValueError with swapped region columns
    # because region names are identical
    assert cooltools.lib.is_valid_expected(
        exp_cis.rename(columns={
            "region1": "region2",
            "region2": "region1"
        }),
        "cis",
        verify_view=view_df[:1],
        verify_cooler=clr,
    )

    # cis that is shortened does not have enough diagonals
    with pytest.raises(ValueError):
        cooltools.lib.is_valid_expected(
            exp_cis[::2],
            "cis",
            verify_view=view_df[:1],
            verify_cooler=clr,
            raise_errors=True,
        )

    exp_cis = cooltools.expected_cis(clr,
                                     view_df=view_df[:2],
                                     intra_only=False)
    # cis with intra_only=False raises ValueError with swapped region columns,
    # because this tries to query lower triangular part of cooler
    with pytest.raises(ValueError):
        cooltools.lib.is_valid_expected(
            exp_cis.rename(columns={
                "region1": "region2",
                "region2": "region1"
            }),
            "cis",
            verify_view=view_df[:2],
            verify_cooler=clr,
            raise_errors=True,
        )

    # trans raises ValueError with swapped region columns
    exp_trans = cooltools.expected_trans(clr, view_df=view_df)
    with pytest.raises(ValueError):
        cooltools.lib.is_valid_expected(
            exp_trans.rename(columns={
                "region1": "region2",
                "region2": "region1"
            }),
            "trans",
            verify_view=view_df[::-1],
            verify_cooler=clr,
            raise_errors=True,
        )
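
A minimal sketch of how these validation helpers might be called outside a test
suite; the file name below is a hypothetical placeholder and a cooltools
installation is assumed.

import cooltools.lib

# load an expected table from disk (hypothetical path), declaring
# which value column it carries
expected_df = cooltools.lib.read_expected_from_file(
    "expected.cis.tsv", expected_value_cols=["balanced.avg"])

# with raise_errors left False, the validator returns True/False
# instead of raising
if cooltools.lib.is_valid_expected(
        expected_df, "cis", expected_value_cols=["balanced.avg"]):
    print("expected table is usable for cis analyses")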
Example #12
def call_dots(
    cool_path,
    expected_path,
    regions,
    expected_name,
    weight_name,
    nproc,
    max_loci_separation,
    max_nans_tolerated,
    tile_size,
    kernel_width,
    kernel_peak,
    num_lambda_chunks,
    fdr,
    dots_clustering_radius,
    verbose,
    out_prefix,
):
    """
    Call dots on a Hi-C heatmap; only pixels within max_loci_separation
    of the diagonal are considered.

    COOL_PATH : The path to a .cool file with a balanced Hi-C map.

    EXPECTED_PATH : The path to a tsv-like file with cis expected values.

    Analysis will be performed for chromosomes referred to in EXPECTED_PATH, and
    therefore these chromosomes must be a subset of chromosomes referred to in
    COOL_PATH. Also, chromosomes referred to in EXPECTED_PATH must be non-trivial,
    i.e., contain non-NaN signal. Thus, make sure to prune your EXPECTED_PATH
    before applying this script.

    COOL_PATH and EXPECTED_PATH must be binned at the same resolution.

    EXPECTED_PATH must contain at least the following columns for cis contacts:
    'region', 'diag', 'n_valid', value_name. value_name is controlled using
    options. A header must be present in the file.

    """
    clr = cooler.Cooler(cool_path)

    # preliminary SCHEMA for cis-expected
    region_column_name = "region"
    expected_columns = [region_column_name, "diag", "n_valid", expected_name]
    expected_dtypes = {
        region_column_name: str,
        "diag": np.int64,
        "n_valid": np.int64,
        expected_name: np.float64,
    }

    try:
        expected = pd.read_table(
            expected_path,
            usecols=expected_columns,
            dtype=expected_dtypes,
            comment=None,
            verbose=verbose,
        )
    except ValueError as e:
        raise ValueError(
            "input expected does not match the schema\n"
            "tab-separated expected file must have a header as well") from e
    expected_index = [
        region_column_name,
        "diag",
    ]
    expected.set_index(expected_index, inplace=True)
    # end of SCHEMA for cis-expected

    # Optional reading region table provided by the user:
    if regions is None:
        try:
            uniq_regions = expected.index.get_level_values(
                region_column_name).unique()
            regions_table = bioframe.parse_regions(uniq_regions,
                                                   clr.chromsizes)
            regions_table["name"] = regions_table["chrom"]
        except ValueError as e:
            print(e)
            raise ValueError(
                "Cannot interpret regions from EXPECTED_PATH\n"
                "specify regions definitions using --regions option.")
    else:
        # Flexible reading of the regions table:
        regions_buf, names = util.sniff_for_header(regions)
        regions_table = pd.read_csv(regions_buf, sep="\t", header=None)
        if regions_table.shape[1] not in (3, 4):
            raise ValueError(
                "The region file does not have three or four tab-delimited columns. "
                "We expect a BED-like file with columns chrom, start, end, and an optional name."
            )
        if regions_table.shape[1] == 4:
            regions_table = regions_table.rename(columns={
                0: "chrom",
                1: "start",
                2: "end",
                3: "name"
            })
            regions_table = bioframe.parse_regions(regions_table)
        else:
            regions_table = regions_table.rename(columns={
                0: "chrom",
                1: "start",
                2: "end"
            })
            regions_table = bioframe.parse_regions(regions_table)
        regions_table = regions_table[regions_table["chrom"].isin(
            clr.chromnames)].reset_index(drop=True)

    # Verify appropriate columns order (required for heatmap_tiles_generator_diag):
    regions_table = regions_table[["chrom", "start", "end", "name"]]

    # Input validation
    def get_exp_regions(df):
        return df.index.get_level_values(region_column_name).unique()
    expected_regions = get_exp_regions(expected)

    # the unique set of regions mentioned in expected_path
    # must be a subset of the regions table
    if not set(expected_regions).issubset(regions_table["name"]):
        raise ValueError(
            "Regions in {} must be a subset of ".format(expected_path) +
            f"regions in {'the regions table ' + regions if regions is not None else 'the cooler'}"
        )

    # check number of bins per region in cooler and expected table
    # compute # of bins by comparing matching indexes
    try:
        for region_name, group in expected.reset_index().groupby(
                region_column_name):
            n_diags = group.shape[0]
            region = regions_table.set_index("name").loc[region_name]
            lo, hi = clr.extent(region)
            assert n_diags == (hi - lo)
    except AssertionError:
        raise ValueError("Region shape mismatch between expected and cooler. "
                         "Are they using the same resolution?")
    # All the checks have passed:
    if verbose:
        print("{} and {} passed cross-compatibility checks.".format(
            cool_path, expected_path))

    # by now we have a usable region_table and expected for most scenarios

    # Prepare some parameters.
    binsize = clr.binsize
    loci_separation_bins = int(max_loci_separation / binsize)
    tile_size_bins = int(tile_size / binsize)
    balance_factor = 1.0  # clr._load_attrs("bins/weight")["scale"]

    # clustering would deal with base-pair units for now, so suppress this for now
    # clustering_radius_bins = int(dots_clustering_radius/binsize)

    # kernels
    # 'upright' is a symmetrical inversion of "lowleft", not needed.
    ktypes = ["donut", "vertical", "horizontal", "lowleft"]

    if (kernel_width is None) or (kernel_peak is None):
        w, p = dotfinder.recommend_kernel_params(binsize)
        print(
            f"Using kernel parameters w={w}, p={p} recommended for binsize {binsize}"
        )
    else:
        w, p = kernel_width, kernel_peak
        # add some sanity check for w,p:
        assert w > p, f"Wrong inner/outer kernel parameters w={w}, p={p}"
        print(f"Using kernel parameters w={w}, p={p} provided by user")

    # once kernel parameters are set up, check max_nans_tolerated to make
    # sure kernel footprints overlapping the NaN-filled rows/columns on one
    # side are not "allowed"; this requires dynamic adjustment for the
    # "shrinking donut"
    assert max_nans_tolerated <= 2 * w, "Too many NaNs allowed!"
    # otherwise this may lead to scoring the same pixel twice, i.e., duplicates

    # generate standard kernels - consider providing custom ones
    kernels = {k: dotfinder.get_kernel(w, p, k) for k in ktypes}

    # list of tile coordinate ranges
    tiles = list(
        dotfinder.heatmap_tiles_generator_diag(clr, regions_table, w,
                                               tile_size_bins,
                                               loci_separation_bins))

    # lambda-chunking edges ...
    assert dotfinder.HiCCUPS_W1_MAX_INDX <= num_lambda_chunks <= 50
    base = 2**(1 / 3)
    ledges = np.concatenate((
        [-np.inf],
        np.logspace(
            0,
            num_lambda_chunks - 1,
            num=num_lambda_chunks,
            base=base,
            dtype=np.float64,
        ),
        [np.inf],
    ))

    # 1. Calculate genome-wide histograms of scores.
    gw_hist = dotfinder.scoring_and_histogramming_step(
        clr,
        expected,
        expected_name,
        weight_name,
        tiles,
        kernels,
        ledges,
        max_nans_tolerated,
        loci_separation_bins,
        nproc,
        verbose,
    )

    if verbose:
        print("Done building histograms ...")

    # 2. Determine the FDR thresholds.
    threshold_df, qvalues = dotfinder.determine_thresholds(
        kernels, ledges, gw_hist, fdr)

    # 3. Filter using FDR thresholds calculated in the histogramming step
    filtered_pixels = dotfinder.scoring_and_extraction_step(
        clr,
        expected,
        expected_name,
        weight_name,
        tiles,
        kernels,
        ledges,
        threshold_df,
        max_nans_tolerated,
        balance_factor,
        loci_separation_bins,
        op.join(op.dirname(out_prefix),
                op.basename(out_prefix) + ".enriched.tsv"),
        nproc,
        verbose,
        bin1_id_name="bin1_id",
        bin2_id_name="bin2_id",
    )

    # 4. Post-processing
    if verbose:
        print(
            f"Begin post-processing of {len(filtered_pixels)} filtered pixels")
        print("preparing to extract needed q-values ...")

    filtered_pixels_qvals = dotfinder.annotate_pixels_with_qvalues(
        filtered_pixels, qvalues, kernels)
    # 4a. clustering
    ########################################################################
    # Clustering has to be done using annotated DataFrame of filtered pixels
    # why ? - because - clustering has to be done independently for every region!
    ########################################################################
    filtered_pixels_annotated = cooler.annotate(filtered_pixels_qvals,
                                                clr.bins()[:])
    filtered_pixels_annotated = assign_regions(filtered_pixels_annotated,
                                               regions_table)
    # consider resetting the index here
    centroids = dotfinder.clustering_step(filtered_pixels_annotated,
                                          expected_regions,
                                          dots_clustering_radius, verbose)

    # 4b. filter by enrichment and qval
    postprocessed_calls = dotfinder.thresholding_step(centroids)

    # Final-postprocessed result
    if out_prefix is not None:

        postprocessed_fname = op.join(
            op.dirname(out_prefix),
            op.basename(out_prefix) + ".postproc.bedpe")

        postprocessed_calls.to_csv(postprocessed_fname,
                                   sep="\t",
                                   header=True,
                                   index=False,
                                   compression=None)
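
A hedged sketch of invoking call_dots directly; every path and tuning value
below is a hypothetical placeholder chosen to match the docstring, not a
recommended setting.

call_dots(
    cool_path="sample.mcool::resolutions/10000",  # hypothetical cooler URI
    expected_path="expected.cis.tsv",             # hypothetical expected table
    regions=None,                # derive regions from the expected table
    expected_name="balanced.avg",
    weight_name="weight",
    nproc=4,
    max_loci_separation=10_000_000,
    max_nans_tolerated=1,
    tile_size=6_000_000,
    kernel_width=None,           # let recommend_kernel_params pick w and p
    kernel_peak=None,
    num_lambda_chunks=45,
    fdr=0.02,
    dots_clustering_radius=39_000,
    verbose=True,
    out_prefix="results/dots",
)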
Example #13
def sample_cooler(clr, out_clr_path, count=None, frac=None, exact=False,
                  map_func=map, chunksize=int(1e7)):
    """
    Pick a random subset of contacts from a Hi-C map.

    Parameters
    ----------
    clr : cooler.Cooler or str
        A Cooler or a path/URI to a Cooler with input data.

    out_clr_path : str
        A path/URI to the output.

    count : float
        The target number of contacts in the sample.
        Mutually exclusive with `frac`.

    frac : float
        The target sample size as a fraction of contacts in the original
        dataset. Mutually exclusive with `count`.

    exact : bool
        If True, the resulting sample size will exactly match the target value.
        Exact sampling will load the whole pixel table into memory!
        If False, binomial sampling will be used instead and the sample size
        will be randomly distributed around the target value.

    map_func : function
        A map implementation.

    chunksize : int
        The number of pixels loaded and processed per step of computation.

    """
    if isinstance(clr, str):
        clr = cooler.Cooler(clr)

    if count is not None and frac is None:
        frac = count / clr.info['sum']
    elif count is None and frac is not None:
        count = np.round(frac * clr.info['sum'])
    else:
        raise ValueError('Exactly one of count or frac must be specified!')

    if frac >= 1.0:
        raise ValueError('The number of contacts in a sample cannot exceed '
                         'that in the original dataset.')

    if exact:
        pixels = sample_pixels_exact(clr.pixels()[:], count)
        cooler.create_cooler(
            out_clr_path, clr.bins()[:], pixels, ordered=True)

    else:
        pipeline = (
            cooler.tools.split(clr,
                               include_bins=False,
                               map=map_func,
                               chunksize=chunksize)
            .pipe(_extract_pixel_chunk)
            .pipe(sample_pixels_approx, frac=frac)
        )

        cooler.create_cooler(
            out_clr_path, clr.bins()[:], iter(pipeline), ordered=True)
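
Two minimal usage sketches, assuming the function above is importable and a
balanced cooler exists at the hypothetical path "input.cool".

# approximate (binomial) downsampling to ~50% of contacts
sample_cooler("input.cool", "downsampled.cool", frac=0.5)

# exact downsampling to a fixed contact count; note that this loads
# the whole pixel table into memory
sample_cooler("input.cool", "downsampled_exact.cool",
              count=10_000_000, exact=True)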
Example #14
def call_compartments(
    cool_path,
    reference_track,
    view,
    contact_type,
    n_eigs,
    verbose,
    out_prefix,
    bigwig,
):
    """
    Perform eigenvalue decomposition on a cooler matrix to calculate
    compartment signal by finding the eigenvector that correlates best with the
    phasing track.


    COOL_PATH : the path to a .cool file with a balanced Hi-C map. Use the
    '::' syntax to specify a group path in a multicooler file.

    TRACK_PATH : the path to a BedGraph-like file that stores the phasing track
    in a column named after the track.

    BedGraph-like format assumes tab-separated columns chrom, start, stop and
    track-name.

    """
    clr = cooler.Cooler(cool_path)

    if reference_track is not None:

        # TODO: This all needs to be refactored into a more generic tabular file parser
        # Needs to handle stdin case too.
        track_path, col = reference_track
        buf, names = sniff_for_header(track_path)

        if names is None:
            if not isinstance(col, int):
                raise click.BadParameter(
                    "No header found. "
                    'Cannot find "{}" column without a header.'.format(col))

            track_name = "ref"
            kwargs = dict(
                header=None,
                usecols=[0, 1, 2, col],
                names=["chrom", "start", "end", track_name],
            )
        else:
            if isinstance(col, int):
                try:
                    col = names[col]
                except IndexError:
                    raise click.BadParameter(
                        'Column #{} not compatible with header "{}".'.format(
                            col, ",".join(names)))
            else:
                if col not in names:
                    raise click.BadParameter(
                        'Column "{}" not found in header "{}"'.format(
                            col, ",".join(names)))

            track_name = col
            kwargs = dict(header="infer",
                          usecols=["chrom", "start", "end", track_name])

        track_df = pd.read_table(buf,
                                 dtype={
                                     "chrom": str,
                                     "start": np.int64,
                                     "end": np.int64,
                                     track_name: np.float64,
                                 },
                                 comment="#",
                                 verbose=verbose,
                                 **kwargs)

        # we need to merge phasing track DataFrame with the cooler bins to get
        # a DataFrame with phasing info aligned and validated against bins inside of
        # the cooler file.
        track = pd.merge(left=clr.bins()[:],
                         right=track_df,
                         how="left",
                         on=["chrom", "start", "end"])

        # sanity check: if len(track) grew beyond the number of bins, something
        # in track_df failed to match the ["chrom", "start", "end"] keys
        # from clr.bins()[:].
        if len(track) > len(clr.bins()):
            raise ValueError(
                "There is something in the {} that ".format(track_path) +
                "couldn't be merged with cooler-bins {}".format(cool_path))
    else:
        # use entire bin-table from cooler, when reference-track is not provided:
        track = clr.bins()[["chrom", "start", "end"]][:]
        track_name = None

    # define view for cis compartment-calling
    # use input "view" BED file or all chromosomes mentioned in "track":
    if view is None:
        # Generate viewframe from clr.chromsizes:
        view_df = bioframe.make_viewframe([(chrom, 0, clr.chromsizes[chrom])
                                           for chrom in clr.chromnames])
    else:
        # Make viewframe out of table:
        # Read view_df:
        try:
            view_df = bioframe.read_table(view, schema="bed4", index_col=False)
        except Exception:
            view_df = bioframe.read_table(view, schema="bed3", index_col=False)
        # Convert view_df to viewframe:
        try:
            view_df = bioframe.make_viewframe(view_df,
                                              check_bounds=clr.chromsizes)
        except ValueError as e:
            raise ValueError(
                "View table is incorrect; please comply with the viewframe format."
            ) from e

    # TODO: Add check that view_df has the same bins as track

    # it's contact_type dependent:
    if contact_type == "cis":
        eigvals, eigvec_table = eigdecomp.cooler_cis_eig(
            clr=clr,
            bins=track,
            view_df=view_df,
            n_eigs=n_eigs,
            phasing_track_col=track_name,
            clip_percentile=99.9,
            sort_metric=None,
        )
    elif contact_type == "trans":
        eigvals, eigvec_table = eigdecomp.cooler_trans_eig(
            clr=clr,
            bins=track,
            n_eigs=n_eigs,
            partition=None,
            phasing_track_col=track_name,
            sort_metric=None,
        )

    # Output
    eigvals.to_csv(out_prefix + "." + contact_type + ".lam.txt",
                   sep="\t",
                   index=False)
    eigvec_table.to_csv(out_prefix + "." + contact_type + ".vecs.tsv",
                        sep="\t",
                        index=False)
    if bigwig:
        bioframe.to_bigwig(
            eigvec_table,
            clr.chromsizes,
            out_prefix + "." + contact_type + ".bw",
            value_field="E1",
        )
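
A hedged invocation sketch for the command above; the cooler path and the
phasing-track file (with a header column named "GC") are hypothetical.

call_compartments(
    cool_path="sample.cool",
    reference_track=("gc_track.tsv", "GC"),  # (path, column) pair
    view=None,                  # fall back to whole chromosomes
    contact_type="cis",
    n_eigs=3,
    verbose=True,
    out_prefix="results/compartments",
    bigwig=False,
)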
Example #15
File: score.py  Project: ysora/stripenn
def getScore(cool, coordinates, norm, numcores, out, mask='0'):
    # First read without a header, then check whether the first row looks
    # like data (numeric positions) or like a header line.
    table = pd.read_csv(coordinates, header=None, sep='\t')
    el = table.iloc[0]
    if isinstance(el[1], str) or isinstance(el[2], str):
        # the first row holds column names, so re-read with a header
        table = pd.read_csv(coordinates, header=0, sep='\t')

    table.columns = ['chr', 'pos1', 'pos2', 'chr2', 'pos3', 'pos4'
                     ] + table.columns[6:].tolist()

    core = numcores
    Lib = cooler.Cooler(cool)
    PossibleNorm = Lib.bins().columns
    if norm == 'None':
        norm = False
    elif norm not in PossibleNorm:
        print('Possible normalization methods are:')
        print('None')
        for n in range(3, len(PossibleNorm)):
            print(PossibleNorm[n])
        print(
            "Invalid normalization method. Normalization method is forced to None"
        )
        norm = False

    all_chromnames = Lib.chromnames
    all_chromnames = [x for x in all_chromnames if x != "Y"]
    #all_chromsizes = Lib.chromsizes
    all_chromsizes = [
        Lib.chromsizes[i] for i in range(len(Lib.chromsizes))
        if Lib.chromnames[i] != "Y"
    ]
    all_chromsizes = np.array(all_chromsizes)
    chrom_remain_idx = np.where(all_chromsizes > 1000000)[0]
    all_chromnames = [all_chromnames[i] for i in chrom_remain_idx]
    all_chromsizes = all_chromsizes[chrom_remain_idx]
    chromnames = all_chromnames
    chromsizes = all_chromsizes
    unbalLib = Lib.matrix(balance=norm)
    resol = Lib.binsize
    obj = getStripe.getStripe(unbalLib, resol, 10, 8, 2.5, all_chromnames,
                              chromnames, all_chromsizes, chromsizes, core)
    EV = getStripe.getStripe.mpmean(obj)
    bgleft_up, bgright_up, bgleft_down, bgright_down = getStripe.getStripe.nulldist(
        obj)
    pval = getStripe.getStripe.pvalue(obj, bgleft_up, bgright_up, bgleft_down,
                                      bgright_down, table)
    table.insert(table.shape[1], 'pvalue_added', pval, True)
    MEAN, SUM = getStripe.getStripe.getMean(obj, table)
    s, MEANOE, TOTALOE = obj.scoringstripes(table, EV, mask)
    table.insert(table.shape[1], 'Stripiness_added', s, True)
    table.insert(table.shape[1], "O_Mean_added", MEAN, True)
    table.insert(table.shape[1], "O_Sum_added", SUM, True)
    table.insert(table.shape[1], 'O/E_Mean_added', MEANOE, True)
    table.insert(table.shape[1], 'O/E_Total_added', TOTALOE, True)
    table.to_csv(out, sep="\t", header=True, index=False)
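
A usage sketch under stated assumptions: "sample.cool" and "stripes.tsv" are
hypothetical inputs, and the coordinate table follows the six-column layout
parsed above.

getScore(cool="sample.cool",
         coordinates="stripes.tsv",   # chr, pos1, pos2, chr2, pos3, pos4, ...
         norm="weight",
         numcores=4,
         out="stripes.scored.tsv")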
Example #16
def run(args):
    cool_file_1 = args.cool_1
    cool_file_2 = args.cool_2
    compartment_file = args.comp_file
    RNA_file = args.rna_file
    output_file = args.output
    comp_tb = tabix.open(compartment_file)
    polyA_RNA_tb = tabix.open(RNA_file)
    c1 = cooler.Cooler(cool_file_1)
    c2 = cooler.Cooler(cool_file_2)

    start = time.perf_counter()
    c1_matrix = c1.matrix(balance=False, sparse=False)[:]
    c2_matrix = c2.matrix(balance=False, sparse=False)[:]
    division_matrix = np.divide(c1_matrix, c2_matrix)
    division_matrix_noinf = np.where(np.isinf(division_matrix), np.nan, division_matrix)  # get rid of inf value
    division_matrix_noinf[division_matrix_noinf == 0] = np.nan # get rid of 0 value
    division_matrix_noinf_log = np.log10(division_matrix_noinf)

    chromosome_ls = []
    for i in range(1, 23):
        chromosome_ls.append('chr' + str(i))
    chromosome_ls.append('chrX')

    # to get the bin and corresponding locations as a dictionary
    bins = c1.bins()[:]  # get the bin dataframe from one of the Hic matrix
    bin_dic = {}
    chrom_bin_dic = {}
    for chromosome in chromosome_ls:
        chrom_bins = bins.loc[bins['chrom'] == chromosome]
        chrom_bin_dic[chromosome] = (chrom_bins.index[0], chrom_bins.index[-1])  # absolute bin range for each chromosome
    for index, row in bins.iterrows():  # bins is a pandas dataframe containing the bin information from the cooler file
        bin_dic[index] = [row['chrom'], row['start'], row['end']]

    bin_max = int(chrom_bin_dic['chrX'][1])  # bin_max is the largest bin considered
    trans_bin_sum_dic = {}
    cis_bin_sum_dic = {}
    for bin_num in range(0, chrom_bin_dic['chrX'][1] + 1):  # initialize the bin_sum_dic to 0 for each bin
        trans_bin_sum_dic[bin_num] = 0
        cis_bin_sum_dic[bin_num] = 0

    for chromosome in chromosome_ls:  # without chrY and chrM
        chr_start_bin = chrom_bin_dic[chromosome][0]
        chr_end_bin = chrom_bin_dic[chromosome][1]
        print(f"chromosome {chromosome} 's bin start at {chr_start_bin} and end at {chr_end_bin}")
        for i in range(chr_start_bin, chr_end_bin + 1):
            for j in range(chr_start_bin, chr_end_bin + 1):
                if not np.isnan(division_matrix_noinf_log[i, j]):
                    cis_bin_sum_dic[i] += division_matrix_noinf_log[i, j]
            for j1 in range(0, chr_start_bin):
                if not np.isnan(division_matrix_noinf_log[i, j1]):
                    trans_bin_sum_dic[i] += division_matrix_noinf_log[i, j1]
            for j2 in range(chr_end_bin + 1, bin_max + 1):
                if not np.isnan(division_matrix_noinf_log[i, j2]):
                    trans_bin_sum_dic[i] += division_matrix_noinf_log[i, j2]

    with open(output_file, "w") as f1:
        csv_writer = csv.writer(f1, delimiter='\t')
        csv_writer.writerow(["bin_num", "chr_name", "chr_start", "chr_end", "subcompartment_start", "subcompartment_end", "subcompartment", "cis_bins_sum", "trans_bin_sum", "total_bin_sum", "gene_type", "fkpm", "gene_id"])  # write the header of the output file
        for bin_number in range(0, chrom_bin_dic['chrX'][1]+1):
            chr_name = bin_dic[bin_number][0]  # bin_dic maps bin number to [chrom, start, end]
            chr_start = int(bin_dic[bin_number][1])
            chr_end = int(bin_dic[bin_number][2])
            cis_bin_sum = cis_bin_sum_dic[bin_number]
            trans_bin_sum = trans_bin_sum_dic[bin_number]
            bin_sum = cis_bin_sum + trans_bin_sum
            comp_query = comp_tb.query(chr_name, chr_start, chr_end)
            comp_query_ls = []
            for comp_query_result in comp_query:
                comp_query_ls.append(comp_query_result)
            for comp_query_item in comp_query_ls:
                if int(comp_query_item[1]) <= chr_start:
                    comp_start = chr_start
                else:
                    comp_start = int(comp_query_item[1])

                if int(comp_query_item[2]) >= chr_end:
                    comp_end = chr_end
                else:
                    comp_end = int(comp_query_item[2])
                compartment = comp_query_item[3]
                polyA_RNA_query = polyA_RNA_tb.query(chr_name, comp_start, comp_end)
                for query_result in polyA_RNA_query:
                    if query_result[7] != "nan\r":  # row 8 of the query result contain FKPM information of the gene
                        gene_type = query_result[6].split(";")[2].split("=")[1]
                        gene_id = query_result[6].split(";")[1].split("=")[1]
                        fkpm_value = float(query_result[7])
                        csv_writer.writerow([bin_number, chr_name, chr_start, chr_end, comp_start, comp_end, compartment, cis_bin_sum, trans_bin_sum, bin_sum, gene_type, fkpm_value, gene_id])
    end = time.perf_counter()
    print(f"Analysis finished in {round(end - start, 2)} seconds")
Example #17
def read_mcooler(f, distance_in_bp, chr1, chr2, res):
    """
    :param f: .cool file path
    :param chr: Which chromosome to read the file for
    :param res: Resolution to extract information from
    :return: Numpy matrix of contact counts
    """
    uri = '%s::/resolutions/%s' % (f, res)
    #uri = '%s::/7' % (f)
    clr = cooler.Cooler(uri)
    if chr1 not in clr.chromnames or chr2 not in clr.chromnames:
        raise NameError('wrong chromosome name!')
    CHRM_SIZE = clr.chromsizes[chr1]
    CHUNK_SIZE = max(2 * distance_in_bp / res, 2000)
    start = 0
    end = min(CHRM_SIZE, CHUNK_SIZE * res)
    result = []

    if chr1 == chr2:
        try:
            #result = clr.matrix(balance=True,sparse=True).fetch(chr1)#as_pixels=True, join=True
            while start < CHRM_SIZE:                
                temp = clr.matrix(balance=True, sparse=True).fetch((chr1, int(start), int(end)))
                temp = sparse.triu(temp)
                # zero out NaN/inf values in the sparse data array
                temp.data = np.nan_to_num(temp.data, copy=False, nan=0, posinf=0, neginf=0)
                start_in_px = int(start / res)
                if len(temp.row) == 0:
                    start = start + CHUNK_SIZE * res - distance_in_bp
                    end = end + CHUNK_SIZE * res - distance_in_bp
                    print(start, end)
                    continue
           
                if result == []:
                    result += [list(start_in_px + temp.row),
                               list(start_in_px + temp.col),
                               list(temp.data)]
                    prev_block = set(zip(start_in_px + temp.row,
                                         start_in_px + temp.col, temp.data))
                else:
                    cur_block = set(zip(start_in_px + temp.row,
                                        start_in_px + temp.col, temp.data))
                    to_add_list = list(cur_block - prev_block)
                    del prev_block
                    result[0] += [x[0] for x in to_add_list]
                    result[1] += [x[1] for x in to_add_list]
                    result[2] += [x[2] for x in to_add_list]
                    prev_block = cur_block
                    del cur_block

                print(start, end)
                start = min(start + CHUNK_SIZE * res - distance_in_bp, CHRM_SIZE)
                end = min(end + CHUNK_SIZE * res - distance_in_bp, CHRM_SIZE - 1)
        except Exception as e:
            raise NameError('Reading from the file failed!') from e
        x = np.array(result[0])
        y = np.array(result[1])
        val = np.array(result[2])
    else:
        result = clr.matrix(balance=True, sparse=True).fetch(chr1, chr2)
        result = sparse.triu(result)
        # zero out NaN/inf values in the sparse data array
        result.data = np.nan_to_num(result.data, copy=False, nan=0, posinf=0, neginf=0)
        x = result.row
        y = result.col
        val = result.data

    val[np.isnan(val)] = 0
    if chr1 == chr2:
        dist_f = np.logical_and(np.abs(x - y) <= distance_in_bp / res, val > 0)
        x = x[dist_f]
        y = y[dist_f]
        val = val[dist_f]
    return np.array(x), np.array(y), np.array(val)
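
A brief usage sketch with hypothetical arguments: fetch intra-chromosomal
contacts on chr1 within 2 Mb of the diagonal, at 10 kb resolution.

x, y, val = read_mcooler("sample.mcool", distance_in_bp=2_000_000,
                         chr1="chr1", chr2="chr1", res=10000)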
Example #18
import cooler
import hic_basics
import argparse

parser = argparse.ArgumentParser(
    description='Run intrachromosomal clustering algorithm.')
parser.add_argument('-cool',
                    required=True,
                    help="Cooler file containing Hi-C data.")
parser.add_argument(
    '-outdir',
    required=True,
    help="Output dir, where 'clusters_chr*.txt' files will be generated.")
parser.add_argument('-k', required=True, type=int, help="Number of clusters.")
params = parser.parse_args()

cf = cooler.Cooler(params.cool)

hic_basics.cluster_compartments(cf=cf,
                                k=params.k,
                                chrlist=cf.chromnames,
                                outdir=params.outdir)
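
Assuming the script above is saved as cluster_compartments.py, a hypothetical
command-line invocation would look like:

# python cluster_compartments.py -cool sample.cool -outdir clusters/ -k 5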