예제 #1
0
def load_hic_data_from_reads(fnam, resolution, **kwargs):
    """
    Load a Hi-C interaction matrix from a TSV file of paired read-ends.

    Chromosome names and lengths are read from the '# CRM <name> <length>'
    header lines of the file (other '#' lines are skipped). Each following
    line is counted as one interaction: columns 2-3 hold the chromosome and
    position of the first read-end, columns 8-9 those of the second one. The
    count is added to both cells ([bin1, bin2] and [bin2, bin1]).

    A file with no read lines (empty, or header only) yields an empty
    matrix.

    :param fnam: path to the tsv file with reads1 and reads2
    :param resolution: the resolution of the experiment (size of a bin in
       bases)
    :param True get_sections: build the (chromosome, bin) -> column index.
       Set it to False for very very high resolution, when the column index
       does not fit in memory; bins are then numbered by their position
       inside the chromosome only.

    :returns: HiC_data object, with its 'symmetricized' attribute set to True
    """
    genome_seq = OrderedDict()
    size = 0
    with open(fnam) as fhandler:
        # '' can only come from the default at end of file (a real line
        # always holds at least its newline), so it is a safe sentinel
        line = next(fhandler, '')
        while line.startswith('#'):
            if line.startswith('# CRM '):
                crm, clen = line[6:].split()
                # number of bins needed to cover the chromosome
                genome_seq[crm] = int(clen) // resolution + 1
                size += genome_seq[crm]
            line = next(fhandler, '')

        # genome-wide column index of each (chromosome, bin) pair, numbered
        # consecutively following the order of the header
        dict_sec = {}
        if kwargs.get('get_sections', True):
            for crm in genome_seq:
                start = len(dict_sec)
                for i in xrange(genome_seq[crm]):
                    dict_sec[(crm, i)] = start + i

        imx = HiC_data((), size, genome_seq, dict_sec, resolution=resolution)
        while line:
            _, cr1, ps1, _, _, _, _, cr2, ps2, _ = line.split('\t', 9)
            bin1 = int(ps1) // resolution
            bin2 = int(ps2) // resolution
            try:
                idx1, idx2 = dict_sec[(cr1, bin1)], dict_sec[(cr2, bin2)]
            except KeyError:
                # no index available (get_sections=False or chromosome
                # missing from the header): fall back to the untranslated
                # bins for BOTH ends, never mixing a translated index with
                # a raw bin
                idx1, idx2 = bin1, bin2
            imx[idx1, idx2] += 1
            imx[idx2, idx1] += 1
            line = next(fhandler, '')
    imx.symmetricized = True
    return imx
예제 #2
0
def load_hic_data_from_reads(fnam, resolution, **kwargs):
    """
    Load a Hi-C interaction matrix from a TSV file of paired read-ends.

    Chromosome names and lengths are read from the '# CRM <name> <length>'
    header lines of the file (other '#' lines are skipped). Each following
    line is counted as one interaction: columns 2-3 hold the chromosome and
    position of the first read-end, columns 8-9 those of the second one. The
    count is added to both cells ([bin1, bin2] and [bin2, bin1]).

    A file with no read lines (empty, or header only) yields an empty
    matrix.

    :param fnam: path to the tsv file with reads1 and reads2
    :param resolution: the resolution of the experiment (size of a bin in
       bases)
    :param True get_sections: build the (chromosome, bin) -> column index.
       Set it to False for very very high resolution, when the column index
       does not fit in memory; bins are then numbered by their position
       inside the chromosome only.

    :returns: HiC_data object, with its 'symmetricized' attribute set to True
    """
    genome_seq = OrderedDict()
    size = 0
    with open(fnam) as fhandler:
        # '' can only come from the default at end of file (a real line
        # always holds at least its newline), so it is a safe sentinel
        line = next(fhandler, '')
        while line.startswith('#'):
            if line.startswith('# CRM '):
                crm, clen = line[6:].split()
                # number of bins needed to cover the chromosome
                genome_seq[crm] = int(clen) // resolution + 1
                size += genome_seq[crm]
            line = next(fhandler, '')

        # genome-wide column index of each (chromosome, bin) pair, numbered
        # consecutively following the order of the header
        dict_sec = {}
        if kwargs.get('get_sections', True):
            for crm in genome_seq:
                start = len(dict_sec)
                for i in xrange(genome_seq[crm]):
                    dict_sec[(crm, i)] = start + i

        imx = HiC_data((), size, genome_seq, dict_sec, resolution=resolution)
        while line:
            _, cr1, ps1, _, _, _, _, cr2, ps2, _ = line.split('\t', 9)
            bin1 = int(ps1) // resolution
            bin2 = int(ps2) // resolution
            try:
                idx1, idx2 = dict_sec[(cr1, bin1)], dict_sec[(cr2, bin2)]
            except KeyError:
                # no index available (get_sections=False or chromosome
                # missing from the header): fall back to the untranslated
                # bins for BOTH ends, never mixing a translated index with
                # a raw bin
                idx1, idx2 = bin1, bin2
            imx[idx1, idx2] += 1
            imx[idx2, idx1] += 1
            line = next(fhandler, '')
    imx.symmetricized = True
    return imx
예제 #3
0
def optimal_reader(f, normalized=False, resolution=1):
    """
    Reads a matrix generated by TADbit.
    Can be slower than autoreader, but uses almost a third of the memory

    The leading '#' lines are skipped (a '# MASKED' line provides the masked
    bins). In every following line the first two whitespace-separated fields
    are taken as the row label and the remaining ones as the values of the
    row. Only non-zero values are stored.

    :param f: an open, seekable file (it is read twice through 'seek').
    :param False normalized: if the matrix is normalized (values are parsed
       as float instead of int)
    :param 1 resolution: resolution of the matrix

    :returns: HiC_data object
    """
    # get masked bins, and the offset at which the matrix itself starts
    masked = {}
    pos = 0
    for line in f:
        if line[0] != '#':
            break
        pos += len(line)
        if line.startswith('# MASKED'):
            masked = dict([(int(n), True) for n in line.split()[2:]])
    f.seek(pos)

    # first pass: only the two label columns of each row
    header = [tuple(line.split(None, 2)[:2]) for line in f]

    f.seek(pos)

    ncol = len(header)

    # Get the numeric values and remove extra columns
    num = float if normalized else int
    chromosomes, sections, resolution = _header_to_section(header, resolution)

    def nonzero_cells():
        "Yield (flat index, value) for each non-zero cell, parsed only once"
        for i, line in enumerate(f):
            for j, v in enumerate(line.split()[2:], i * ncol):
                val = num(v)
                if val:
                    yield j, val

    #############################################################
    # monkey patch HiC_data to make it faster
    def fast_setitem(self, key, val):
        "Use directly the parent class setitem"
        super(HiC_data, self).__setitem__(key, val)

    def fast_getitem(self, key):
        "Use directly the parent class getitem, missing keys count as 0"
        try:
            return super(HiC_data, self).__getitem__(key)
        except KeyError:
            return 0

    original_setitem = HiC_data.__setitem__
    original_getitem = HiC_data.__getitem__
    # apply the patch
    HiC_data.__setitem__ = fast_setitem
    HiC_data.__getitem__ = fast_getitem
    try:
        hic = HiC_data(nonzero_cells(), size=ncol, masked=masked,
                       dict_sec=sections, chromosomes=chromosomes,
                       resolution=resolution, symmetricized=False)

        # make it symmetric
        if is_asymmetric_dico(hic):
            hic.symmetricized = True
            symmetrize_dico(hic)
    finally:
        # undo patching, also when parsing fails: the patch is class-wide
        # and would otherwise stay active for every HiC_data object
        HiC_data.__setitem__ = original_setitem
        HiC_data.__getitem__ = original_getitem
    # NOTE(review): kept from the original code; special methods are
    # presumably looked up on the class, so these instance attributes may
    # have no effect -- confirm before removing
    hic.__setitem__ = original_setitem
    hic.__getitem__ = original_getitem
    #############################################################
    return hic
예제 #4
0
def load_hic_data_from_bam(fnam,
                           resolution,
                           biases=None,
                           tmpdir='.',
                           ncpus=8,
                           filter_exclude=(1, 2, 3, 4, 6, 7, 8, 9, 10),
                           region=None,
                           verbose=True,
                           clean=True):
    """
    Load raw Hi-C interaction counts from a BAM file into a HiC_data object.

    Chromosome names and lengths are read from the BAM header, the counts
    are filled in by 'get_matrix' (called with normalization='raw') and the
    matrix is finally symmetrized. Biases, when given, are not applied to
    the counts, they are only attached to the returned object ('bads',
    'bias' and 'expected' attributes).

    :param fnam: TADbit-generated BAM file with read-ends1 and read-ends2
    :param resolution: the resolution of the experiment (size of a bin in
       bases)
    :param None biases: path to pickle file where are stored the biases, or
       the already loaded dictionary. Keys should be: 'biases', 'badcol',
       'decay' and 'resolution'
    :param '.' tmpdir: path to folder where to create temporary files
       (passed on to get_matrix)
    :param 8 ncpus: passed on to get_matrix
    :param (1, 2, 3, 4, 6, 7, 8, 9, 10) filter exclude: filters to define the
       set of valid pair of reads.
    :param None region: chromosome name, if None, all genome will be loaded
    :param True verbose: passed on to get_matrix
    :param True clean: passed on to get_matrix

    :raises KeyError: if 'region' is not a reference name of the BAM file
    :raises Exception: if the resolution stored with the biases differs from
       the one requested

    :returns: HiC_data object
    """
    bam = AlignmentFile(fnam)
    try:
        # number of bins needed to cover each chromosome
        genome_seq = OrderedDict(
            (crm, length // resolution + 1)
            for crm, length in zip(bam.references, bam.lengths))
    finally:
        bam.close()

    # genome-wide column index of each (chromosome, bin) pair, numbered
    # consecutively following the order of the BAM header
    dict_sec = {}
    for crm in genome_seq:
        start = len(dict_sec)
        for i in xrange(genome_seq[crm]):
            dict_sec[(crm, i)] = start + i

    size = sum(genome_seq.values())

    chromosomes = {region: genome_seq[region]} if region else genome_seq
    imx = HiC_data((),
                   size,
                   chromosomes=chromosomes,
                   dict_sec=dict_sec,
                   resolution=resolution)

    if biases:
        if isinstance(biases, basestring):
            # NOTE(review): 'load' is presumably pickle's; unpickling can
            # execute arbitrary code, only use bias files that are trusted
            with open(biases, 'rb') as handle:
                biases = load(handle)
        if biases['resolution'] != resolution:
            raise Exception('ERROR: resolution of biases do not match to the '
                            'one wanted (%d vs %d)' %
                            (biases['resolution'], resolution))
        if region:
            # number of bins in the chromosomes preceding the region, used
            # to shift the genome-wide bias keys
            chrom_start = 0
            for crm in genome_seq:
                if crm == region:
                    break
                chrom_start += genome_seq[crm]
            imx.bads = dict((b - chrom_start, biases['badcol'][b])
                            for b in biases['badcol'])
            imx.bias = dict((b - chrom_start, biases['biases'][b])
                            for b in biases['biases'])
        else:
            imx.bads = biases['badcol']
            imx.bias = biases['biases']
        imx.expected = biases['decay']

    # biases=None on purpose: raw counts are requested here
    get_matrix(fnam,
               resolution,
               biases=None,
               filter_exclude=filter_exclude,
               normalization='raw',
               tmpdir=tmpdir,
               clean=clean,
               ncpus=ncpus,
               dico=imx,
               region1=region,
               verbose=verbose)
    imx._symmetricize()
    imx.symmetricized = True

    return imx
예제 #5
0
def optimal_reader(f, normalized=False, resolution=1):
    """
    Reads a matrix generated by TADbit.
    Can be slower than autoreader, but uses almost a third of the memory

    The leading '#' lines are skipped (a '# MASKED' line provides the masked
    bins). In every following line the first two whitespace-separated fields
    are taken as the row label and the remaining ones as the values of the
    row. Only non-zero values are stored.

    :param f: an open, seekable file (it is read twice through 'seek').
    :param False normalized: if the matrix is normalized (values are parsed
       as float instead of int)
    :param 1 resolution: resolution of the matrix

    :returns: HiC_data object
    """
    # get masked bins, and the offset at which the matrix itself starts
    masked = {}
    pos = 0
    for line in f:
        if line[0] != '#':
            break
        pos += len(line)
        if line.startswith('# MASKED'):
            masked = dict([(int(n), True) for n in line.split()[2:]])
    f.seek(pos)

    # first pass: only the two label columns of each row
    header = [tuple(line.split(None, 2)[:2]) for line in f]

    f.seek(pos)

    ncol = len(header)

    # Get the numeric values and remove extra columns
    num = float if normalized else int
    chromosomes, sections, resolution = _header_to_section(header, resolution)

    def nonzero_cells():
        "Yield (flat index, value) for each non-zero cell, parsed only once"
        for i, line in enumerate(f):
            for j, v in enumerate(line.split()[2:], i * ncol):
                val = num(v)
                if val:
                    yield j, val

    #############################################################
    # monkey patch HiC_data to make it faster
    def fast_setitem(self, key, val):
        "Use directly the parent class setitem"
        super(HiC_data, self).__setitem__(key, val)

    def fast_getitem(self, key):
        "Use directly the parent class getitem, missing keys count as 0"
        try:
            return super(HiC_data, self).__getitem__(key)
        except KeyError:
            return 0

    original_setitem = HiC_data.__setitem__
    original_getitem = HiC_data.__getitem__
    # apply the patch
    HiC_data.__setitem__ = fast_setitem
    HiC_data.__getitem__ = fast_getitem
    try:
        hic = HiC_data(nonzero_cells(), size=ncol, masked=masked,
                       dict_sec=sections, chromosomes=chromosomes,
                       resolution=resolution, symmetricized=False)

        # make it symmetric
        if is_asymmetric_dico(hic):
            hic.symmetricized = True
            symmetrize_dico(hic)
    finally:
        # undo patching, also when parsing fails: the patch is class-wide
        # and would otherwise stay active for every HiC_data object
        HiC_data.__setitem__ = original_setitem
        HiC_data.__getitem__ = original_getitem
    # NOTE(review): kept from the original code; special methods are
    # presumably looked up on the class, so these instance attributes may
    # have no effect -- confirm before removing
    hic.__setitem__ = original_setitem
    hic.__getitem__ = original_getitem
    #############################################################
    return hic
예제 #6
0
def load_hic_data_from_bam(fnam, resolution, biases=None, tmpdir='.', ncpus=8,
                           filter_exclude=(1, 2, 3, 4, 6, 7, 8, 9, 10),
                           region=None, verbose=True, clean=True):
    """
    Load raw Hi-C interaction counts from a BAM file into a HiC_data object.

    Chromosome names and lengths are read from the BAM header, the counts
    are filled in by 'get_matrix' (called with normalization='raw') and the
    matrix is finally symmetrized. Biases, when given, are not applied to
    the counts, they are only attached to the returned object ('bads',
    'bias' and 'expected' attributes).

    :param fnam: TADbit-generated BAM file with read-ends1 and read-ends2
    :param resolution: the resolution of the experiment (size of a bin in
       bases)
    :param None biases: path to pickle file where are stored the biases, or
       the already loaded dictionary. Keys should be: 'biases', 'badcol',
       'decay' and 'resolution'
    :param '.' tmpdir: path to folder where to create temporary files
       (passed on to get_matrix)
    :param 8 ncpus: passed on to get_matrix
    :param (1, 2, 3, 4, 6, 7, 8, 9, 10) filter exclude: filters to define the
       set of valid pair of reads.
    :param None region: chromosome name, if None, all genome will be loaded
    :param True verbose: passed on to get_matrix
    :param True clean: passed on to get_matrix

    :raises KeyError: if 'region' is not a reference name of the BAM file
    :raises Exception: if the resolution stored with the biases differs from
       the one requested

    :returns: HiC_data object
    """
    bam = AlignmentFile(fnam)
    try:
        # number of bins needed to cover each chromosome
        genome_seq = OrderedDict(
            (crm, length // resolution + 1)
            for crm, length in zip(bam.references, bam.lengths))
    finally:
        bam.close()

    # genome-wide column index of each (chromosome, bin) pair, numbered
    # consecutively following the order of the BAM header
    dict_sec = {}
    for crm in genome_seq:
        start = len(dict_sec)
        for i in xrange(genome_seq[crm]):
            dict_sec[(crm, i)] = start + i

    size = sum(genome_seq.values())

    chromosomes = {region: genome_seq[region]} if region else genome_seq
    imx = HiC_data((), size, chromosomes=chromosomes, dict_sec=dict_sec,
                   resolution=resolution)

    if biases:
        if isinstance(biases, basestring):
            # NOTE(review): 'load' is presumably pickle's; unpickling can
            # execute arbitrary code, only use bias files that are trusted
            with open(biases, 'rb') as handle:
                biases = load(handle)
        if biases['resolution'] != resolution:
            raise Exception('ERROR: resolution of biases do not match to the '
                            'one wanted (%d vs %d)' % (
                                biases['resolution'], resolution))
        if region:
            # number of bins in the chromosomes preceding the region, used
            # to shift the genome-wide bias keys
            chrom_start = 0
            for crm in genome_seq:
                if crm == region:
                    break
                chrom_start += genome_seq[crm]
            imx.bads = dict((b - chrom_start, biases['badcol'][b])
                            for b in biases['badcol'])
            imx.bias = dict((b - chrom_start, biases['biases'][b])
                            for b in biases['biases'])
        else:
            imx.bads = biases['badcol']
            imx.bias = biases['biases']
        imx.expected = biases['decay']

    # biases=None on purpose: raw counts are requested here
    get_matrix(fnam, resolution, biases=None, filter_exclude=filter_exclude,
               normalization='raw', tmpdir=tmpdir, clean=clean,
               ncpus=ncpus, dico=imx, region1=region, verbose=verbose)
    imx._symmetricize()
    imx.symmetricized = True

    return imx