def test_map(self):
    """ContigSet.map should apply a function to every contig in the set."""
    dtype = np.dtype([('tip', np.int64), ('element', 'O')])
    header_chr1 = loci.Header(reference='chr1', strand='+', category='gypsy')
    header_chr2 = loci.Header(reference='chr2', strand='+', category='gypsy')

    initial = loci.ContigSet(
        loci.Contig(header_chr1,
                    np.array([(1, 'gypsy1'), (7, 'gypsy4')], dtype=dtype)),
        loci.Contig(header_chr2,
                    np.array([(3, 'gypsy7'), (9, 'gypsy1')], dtype=dtype)))

    def shift_tips(contig):
        """dummy function that adds 100 to contig loci 'tip's"""
        shifted = np.copy(contig.loci)
        shifted['tip'] += 100
        return loci.Contig(contig.header, shifted)

    expected = loci.ContigSet(
        loci.Contig(header_chr1,
                    np.array([(101, 'gypsy1'), (107, 'gypsy4')], dtype=dtype)),
        loci.Contig(header_chr2,
                    np.array([(103, 'gypsy7'), (109, 'gypsy1')], dtype=dtype)))

    assert initial.map(shift_tips) == expected
def test_create_contig_ids():
    """create_contig_ids should append a unique 'ID' field to each locus."""
    # input loci: start/stop interval plus the matched element name
    dtype_loci_query = np.dtype([('start', np.int64),
                                 ('stop', np.int64),
                                 ('element', 'O')])
    query = loci.ContigSet(
        loci.Contig(
            loci.Header(reference='chr1', strand='+', category='gypsy'),
            np.array([(1, 5, 'gypsy1'), (7, 9, 'gypsy4')],
                     dtype=dtype_loci_query)),
        loci.Contig(
            loci.Header(reference='chr1', strand='-', category='gypsy'),
            np.array([(3, 8, 'gypsy7'), (9, 12, 'gypsy1')],
                     dtype=dtype_loci_query)))
    # expected output gains an object-typed 'ID' column; the expected values
    # show IDs of the form '<category>_<reference>_<strand>_<position>',
    # where position appears to be 'stop' on '+' and 'start' on '-'
    # (i.e. the locus edge nearest the putative insertion — inferred from
    # the fixture values, confirm against create_contig_ids)
    dtype_loci_answer = np.dtype([('start', np.int64),
                                  ('stop', np.int64),
                                  ('element', 'O'),
                                  ('ID', 'O')])
    answer = loci.ContigSet(
        loci.Contig(
            loci.Header(reference='chr1', strand='+', category='gypsy'),
            np.array([(1, 5, 'gypsy1', 'gypsy_chr1_+_5'),
                      (7, 9, 'gypsy4', 'gypsy_chr1_+_9')],
                     dtype=dtype_loci_answer)),
        loci.Contig(
            loci.Header(reference='chr1', strand='-', category='gypsy'),
            np.array([(3, 8, 'gypsy7', 'gypsy_chr1_-_3'),
                      (9, 12, 'gypsy1', 'gypsy_chr1_-_9')],
                     dtype=dtype_loci_answer)))
    assert query.map(fingerprint.create_contig_ids) == answer
def test_count_reads_n2():
    """count_reads with n_common_elements=2 should report, per query locus
    and per sample (bam), the read count plus the two most common elements.
    """
    # read tips from two samples ('bam1' and 'bam2') on the same strand
    dtype_loci_reads = np.dtype([('tip', np.int64), ('element', 'O')])
    reads = loci.ContigSet(
        loci.Contig(
            loci.Header(reference='chr1', strand='+', category='gypsy',
                        source='bam1'),
            np.array([(2, 'gypsy1'), (4, 'gypsy1'), (5, 'gypsy4'),
                      (7, 'gypsy4'), (7, 'gypsy7'), (7, 'gypsy1'),
                      (8, 'gypsy1'), (8, 'gypsy1')],
                     dtype=dtype_loci_reads)),
        loci.Contig(
            loci.Header(reference='chr1', strand='+', category='gypsy',
                        source='bam2'),
            np.array([(3, 'gypsy1'), (4, 'gypsy1'), (6, 'gypsy1'),
                      (7, 'gypsy1'), (7, 'gypsy1'), (7, 'gypsy1'),
                      (7, 'gypsy1'), (50, 'gypsy7')],
                     dtype=dtype_loci_reads)))
    # two query intervals to count reads within
    dtype_loci_query = np.dtype([('start', np.int64), ('stop', np.int64)])
    query = loci.ContigSet(
        loci.Contig(
            loci.Header(reference='chr1', strand='+', category='gypsy'),
            np.array([(1, 15), (30, 60)], dtype=dtype_loci_query)))
    # nested dtype: one 'sample' sub-field per bam, each holding the sample
    # name, its read count, and the top n_common_elements=2 element counts
    dtype_loci_answer = np.dtype([
        ('start', np.int64), ('stop', np.int64), ('median', np.int64),
        ('sample', [('0', [('name', 'O'), ('count', np.int64),
                           ('element', [('0', [('name', 'O'),
                                               ('count', np.int64)]),
                                        ('1', [('name', 'O'),
                                               ('count', np.int64)])])]),
                    ('1', [('name', 'O'), ('count', np.int64),
                           ('element', [('0', [('name', 'O'),
                                               ('count', np.int64)]),
                                        ('1', [('name', 'O'),
                                               ('count', np.int64)])])])])
    ])
    # with trim=True the interval (1, 15) is trimmed to the read extent
    # (2, 8), and (30, 60) to (50, 50); '.' with count 0 pads missing
    # elements — inferred from the fixture values
    answer = loci.ContigSet(
        loci.Contig(
            loci.Header(reference='chr1', strand='+', category='gypsy'),
            np.array([(2, 8, 7,
                       (('bam1', 8, (('gypsy1', 5), ('gypsy4', 2))),
                        ('bam2', 7, (('gypsy1', 7), ('.', 0))))),
                      (50, 50, 50,
                       (('bam1', 0, (('.', 0), ('.', 0))),
                        ('bam2', 1, (('gypsy7', 1), ('.', 0)))))],
                     dtype=dtype_loci_answer)))
    assert fingerprint.count_reads(query, reads,
                                   trim=True,
                                   n_common_elements=2) == answer
def test_extract_gff_intervals():
    """extract_gff_intervals should read annotated transposon intervals for
    the requested reference and categories from a gff file.
    """
    gff = DATA_PATH + 'testAnnotation-2017-11-27.gff'
    query = fingerprintio.extract_gff_intervals(gff, 'chr1',
                                                ['Gypsy', 'Copia'])
    dtype_loci = np.dtype([('start', np.int64),
                           ('stop', np.int64),
                           ('element', '<O')])
    # one contig per (reference, category); source is the gff basename
    answer = loci.ContigSet(
        loci.Contig(
            loci.Header(reference='chr1', category='Gypsy',
                        source='testAnnotation-2017-11-27.gff'),
            np.array([(3150, 3200, 'Gypsy-21_ClassI;chr1:3150-3200'),
                      (24250, 24700, 'Gypsy-21_ClassI;chr1:24250-24700')],
                     dtype=dtype_loci)),
        loci.Contig(
            loci.Header(reference='chr1', category='Copia',
                        source='testAnnotation-2017-11-27.gff'),
            np.array([(98260, 98322, 'Copia-10_ClassI;chr1:98260-98322')],
                     dtype=dtype_loci)))
    assert query == answer
def test_add_append_headers(self):
    """Contigs with same header should be appended"""
    dtype = np.dtype([('tip', np.int64), ('element', 'O')])
    shared_header = loci.Header(reference='chr1', strand='+',
                                category='gypsy')
    first = loci.Contig(
        shared_header,
        np.array([(1, 'gypsy1'), (7, 'gypsy4')], dtype=dtype))
    second = loci.Contig(
        shared_header,
        np.array([(3, 'gypsy7'), (9, 'gypsy1')], dtype=dtype))

    merged = loci.ContigSet(first)
    merged.add(second, append_duplicate_headers=True)

    # appended rather than replaced: four loci under a single header/contig
    assert len(merged) == 4
    assert len(list(merged.contigs())) == 1
    assert len(merged.headers()) == 1

    expected = np.array([(1, 'gypsy1'), (7, 'gypsy4'),
                         (3, 'gypsy7'), (9, 'gypsy1')], dtype=dtype)
    npt.assert_array_equal(list(merged.contigs())[0].loci, expected)
def match_known_insertions(clusters, known_insertions, distance=0):
    """
    Match clusters to known insertions annotated in the genome.

    Known insertions are represented as an object of :class:`loci.ContigSet`
    created from a gff file.

    Clusters are matched to a known insertion if they are for the same
    category and are within the specified distance of the insertions end.

    Fields required in 'clusters':
    'start': int, 'stop': int, 'median': int

    Fields required in 'known_insertions':
    'start': int, 'stop': int, 'element': str

    Fields appended to return value:
    'known_element': str

    :param clusters: a collection of cluster loci (intervals)
    :type clusters: :class:`loci.ContigSet`
    :param known_insertions: a collection of cluster loci (intervals)
    :type known_insertions: :class:`loci.ContigSet`
    :param distance: maximum distance for connecting a cluster to a known
        insertion
    :type distance: int

    :return: a collection of cluster loci (intervals) tagged with known
        insertions
    :rtype: :class:`loci.ContigSet`
    """
    matched = loci.ContigSet()

    # make known insertion headers un-stranded and drop origin file
    known_insertions = known_insertions.map(
        lambda x: loci.mutate_header(x, strand='.', source=None))

    # loop through contigs
    for contig in clusters.contigs():

        # get relevant known insertions (un-stranded header lookup)
        known = known_insertions[contig.header.mutate(strand='.')]

        # one match entry per cluster locus, presumably the matched
        # element name or a placeholder — verify against
        # _known_insertion_matcher
        matches = np.array(list(_known_insertion_matcher(contig,
                                                         known,
                                                         distance=distance)))
        # NOTE(review): this converts the same data through np.array twice;
        # the first call looks redundant — confirm before simplifying
        matches = np.array(matches,
                           dtype=np.dtype([('known_element', '<O')]))

        # bind the 'known_element' column onto the existing cluster fields
        matched.add(loci.Contig(contig.header,
                                util.numpy.array.bind(contig.loci, matches)))
    return matched
def test_update(self):
    """update() should absorb contigs yielded by another set's iterator."""
    dtype = np.dtype([('tip', np.int64), ('element', 'O')])
    contig_a = loci.Contig(
        loci.Header(reference='chr1', strand='+', category='gypsy'),
        np.array([(1, 'gypsy1'), (7, 'gypsy4')], dtype=dtype))
    contig_b = loci.Contig(
        loci.Header(reference='chr2', strand='+', category='gypsy'),
        np.array([(3, 'gypsy7'), (9, 'gypsy1')], dtype=dtype))

    target = loci.ContigSet(contig_a)
    target.update(loci.ContigSet(contig_b).contigs())

    # both contigs present: four loci across two headers
    assert len(target) == 4
    assert len(list(target.contigs())) == 2
    assert len(target.headers()) == 2
def test_headers(self):
    """headers() should yield the header of every contig in the set."""
    dtype = np.dtype([('tip', np.int64), ('element', 'O')])
    header_a = loci.Header(reference='chr1', strand='+', category='gypsy')
    header_b = loci.Header(reference='chr2', strand='+', category='gypsy')
    contig_a = loci.Contig(
        header_a, np.array([(1, 'gypsy1'), (7, 'gypsy4')], dtype=dtype))
    contig_b = loci.Contig(
        header_b, np.array([(3, 'gypsy7'), (9, 'gypsy1')], dtype=dtype))

    found = set(loci.ContigSet(contig_a, contig_b).headers())
    assert found == {header_a, header_b}
def test_dtype_loci(self):
    """dtype_loci() should report the loci dtype shared by all contigs."""
    dtype_loci = np.dtype([('tip', np.int64), ('element', 'O')])
    header_1 = loci.Header(reference='chr1', strand='+', category='gypsy')
    contig_1 = loci.Contig(
        header_1,
        np.array([(1, 'gypsy1'), (7, 'gypsy4')], dtype=dtype_loci))
    header_2 = loci.Header(reference='chr2', strand='+', category='gypsy')
    contig_2 = loci.Contig(
        header_2,
        np.array([(3, 'gypsy7'), (9, 'gypsy1')], dtype=dtype_loci))
    query = loci.ContigSet(contig_1, contig_2)
    # Fixed copy-paste bug: the original asserted against contig_1 twice;
    # the second assertion clearly intended to cover contig_2.
    assert query.dtype_loci() == contig_1.loci.dtype
    assert query.dtype_loci() == contig_2.loci.dtype
def test_init_clashing_headers(self):
    """Contigs with same header should cause ValueError"""
    dtype = np.dtype([('tip', np.int64), ('element', 'O')])
    shared_header = loci.Header(reference='chr1', strand='+',
                                category='gypsy')
    first = loci.Contig(
        shared_header,
        np.array([(1, 'gypsy1'), (7, 'gypsy4')], dtype=dtype))
    second = loci.Contig(
        shared_header,
        np.array([(3, 'gypsy7'), (9, 'gypsy1')], dtype=dtype))

    # without append_duplicate_headers the constructor must reject clashes
    raised = False
    try:
        loci.ContigSet(first, second)
    except ValueError:
        raised = True
    assert raised
def test_iter_values(self):
    """iter_values() should flatten header fields together with each locus."""
    dtype = np.dtype([('tip', np.int64), ('element', 'O')])
    contig_a = loci.Contig(
        loci.Header(reference='chr1', strand='+', category='gypsy'),
        np.array([(1, 'gypsy1'), (7, 'gypsy4')], dtype=dtype))
    contig_b = loci.Contig(
        loci.Header(reference='chr2', strand='+', category='gypsy'),
        np.array([(3, 'gypsy7'), (9, 'gypsy1')], dtype=dtype))

    expected = {('chr1', '+', 'gypsy', 1, 'gypsy1'),
                ('chr1', '+', 'gypsy', 7, 'gypsy4'),
                ('chr2', '+', 'gypsy', 3, 'gypsy7'),
                ('chr2', '+', 'gypsy', 9, 'gypsy1')}
    assert set(loci.ContigSet(contig_a, contig_b).iter_values()) == expected
def fingerprint(bams, categories, references, minimum_reads, epsilon,
                minimum_epsilon=0, n_common_elements=0, method='SDBICAN',
                fingerprint_buffer=0, join_distance=0, quality=0,
                transposon_tag='ME', annotation=None,
                max_count_proportion=True, cores=1):
    """
    Create a transposon fingerprint of one or more bam files.

    One job is built per reference and dispatched to
    `_fingerprint_dispatch`, serially or via a process pool; the resulting
    contigs are merged into a single :class:`loci.ContigSet`.

    :param bams: path(s) to one or more bam files
    :param categories: name(s) of one or more transposon categories
        (presumably (super-)family prefixes — confirm against callers)
    :param references: name(s) of bam references, or [None] to read the
        reference names from the bam headers
    :param minimum_reads: clustering threshold passed to the dispatcher
        — TODO confirm semantics
    :param epsilon: clustering distance parameter — TODO confirm semantics
    :param minimum_epsilon: lower bound for epsilon — TODO confirm
    :param n_common_elements: number of most-common element counts to
        report per sample — TODO confirm
    :param method: clustering method name (default 'SDBICAN')
    :param fingerprint_buffer: buffer distance passed to the dispatcher
    :param join_distance: pairing distance passed to the dispatcher
    :param quality: minimum read mapping quality
    :param transposon_tag: sam tag holding each read's mate-element name
    :param annotation: optional annotation input passed to the dispatcher
    :param max_count_proportion: flag passed to the dispatcher
        — TODO confirm semantics
    :param cores: number of worker processes (1 = run in-process)

    :return: the merged fingerprint loci
    :rtype: :class:`loci.ContigSet`
    """
    # normalise scalar arguments to lists
    if isinstance(bams, str):
        bams = [bams]
    if isinstance(references, str):
        references = [references]
    if isinstance(categories, str):
        categories = [categories]

    # fall back to the references declared in the bam headers
    if references == [None]:
        references = fingerprintio.extract_references_from_bams(*bams)

    # every argument except `references` is wrapped in a single-element
    # list, so product() yields exactly one job per reference
    jobs = product([bams],
                   [annotation],
                   [categories],
                   references,  # job per reference
                   [quality],
                   [transposon_tag],
                   [minimum_reads],
                   [epsilon],
                   [minimum_epsilon],
                   [n_common_elements],
                   [method],
                   [fingerprint_buffer],
                   [join_distance],
                   [max_count_proportion])

    result = loci.ContigSet()

    if cores == 1:
        # run on a single process
        for job in jobs:
            result.update(_fingerprint_dispatch(*job).contigs())
    else:
        # create a pool of processes
        with Pool(cores) as pool:
            parts = pool.starmap(_fingerprint_dispatch, jobs)
            for part in parts:
                result.update(part.contigs())
    return result
def test_extract_informative_read_tips():
    """
    Test extraction of informative reads.
    Not all families of reads extracted.
    Family with no reads ('NOT-A-FAMILY') extracted.
    """
    bam = DATA_PATH + 'testA-2017-06-08.bam'
    query = fingerprintio.extract_informative_read_tips(
        bam,
        'chr1',
        ['Gypsy', 'PIF-Harbinger', 'NOT-A-FAMILY'],
        quality=0,
        tag='ME')
    dtype_loci = np.dtype([('tip', np.int64), ('element', 'O')])
    # one contig per (strand, category) combination, including empty
    # contigs for the category with no matching reads
    answer = loci.ContigSet(
        loci.Contig(
            loci.Header(reference='chr1', strand='+', category='Gypsy',
                        source='testA-2017-06-08.bam'),
            np.array([(2452, 'Gypsy_Gypsy26_chr15_18793972'),
                      (2506, 'Gypsy_Gypsy26_chr15_18793972'),
                      (2553, 'Gypsy_Gypsy26_chr15_18793972'),
                      (2566, 'Gypsy_Gypsy26_chr15_18793972'),
                      (2577, 'Gypsy_Gypsy26_chr15_18793972'),
                      (2577, 'Gypsy_Gypsy26_chr15_18793972'),
                      (2841, 'Gypsy_Gypsy26_chr15_18793972'),
                      (2841, 'Gypsy_Gypsy26_chr15_18793972'),
                      (2841, 'Gypsy_Gypsy26_chr8_2502854'),
                      (2973, 'Gypsy_Gypsy26_chr18_27801424'),
                      (3024, 'Gypsy_Gypsy26_chr8_5114633'),
                      (3062, 'Gypsy_Gypsy26_chr8_5114633'),
                      (3039, 'Gypsy_Gypsy26_chr2_1987286'),
                      (3138, 'Gypsy_Gypsy26_chr18_27801424'),
                      (24065, 'Gypsy_Gypsy12_chr1_12715223'),
                      (24184, 'Gypsy_Gypsy7_chr4_10302390'),
                      (24195, 'Gypsy_Gypsy12_chr1_12715223'),
                      (24217, 'Gypsy_Gypsy12_chr1_12715223')],
                     dtype=dtype_loci)),
        loci.Contig(
            loci.Header(reference='chr1', strand='-', category='Gypsy',
                        source='testA-2017-06-08.bam'),
            np.array([(3217, 'Gypsy_Gypsy26_chr15_18793972'),
                      (3226, 'Gypsy_Gypsy26_chr15_18793972'),
                      (3246, 'Gypsy_Gypsy26_chr15_18793972'),
                      (3405, 'Gypsy_Gypsy26_chr2_1987286'),
                      (3646, 'Gypsy_Gypsy26_chr15_18793972'),
                      (3776, 'Gypsy_Gypsy26_chr18_27801424'),
                      (3779, 'Gypsy_Gypsy26_chr8_5114633'),
                      (3800, 'Gypsy_Gypsy26_chr8_5114633'),
                      (24787, 'Gypsy_Gypsy7_chr4_10302390'),
                      (24799, 'Gypsy_Gypsy29_chr11_13193899'),
                      (24850, 'Gypsy_Gypsy7_chr4_10302390'),
                      (24854, 'Gypsy_Gypsy12_chr1_12715223'),
                      (24857, 'Gypsy_Gypsy23_chr15_8310356'),
                      (24860, 'Gypsy_Gypsy23_chrUn_38723460'),
                      (24872, 'Gypsy_Gypsy23_chrUn_38723460'),
                      (24877, 'Gypsy_GYVIT1_chr6_13115950'),
                      (24894, 'Gypsy_Gypsy23_chrUn_38723460'),
                      (24895, 'Gypsy_Gypsy12_chr1_12715223'),
                      (24910, 'Gypsy_Gypsy23_chr14_11656393'),
                      (24919, 'Gypsy_Gypsy23_chrUn_38723460')],
                     dtype=dtype_loci)),
        loci.Contig(
            loci.Header(reference='chr1', strand='+',
                        category='PIF-Harbinger',
                        source='testA-2017-06-08.bam'),
            np.array([(21282, 'PIF-Harbinger_Harbinger-3N3_chr16_20723579'),
                      (21308, 'PIF-Harbinger_Harbinger-3_chr2_4407914'),
                      (21435, 'PIF-Harbinger_Harbinger-3N3_chr16_20723579'),
                      (21448, 'PIF-Harbinger_Harbinger-3N3_chr16_20723579')],
                     dtype=dtype_loci)),
        loci.Contig(
            loci.Header(reference='chr1', strand='-',
                        category='PIF-Harbinger',
                        source='testA-2017-06-08.bam'),
            np.array([(21834, 'PIF-Harbinger_Harbinger-3N3_chr16_20723579'),
                      (21945, 'PIF-Harbinger_Harbinger-3N3_chr16_20723579'),
                      (21968, 'PIF-Harbinger_Harbinger-3N3_chr16_20723579'),
                      (21982, 'PIF-Harbinger_Harbinger-3N3_chr16_20723579')],
                     dtype=dtype_loci)),
        loci.Contig(
            loci.Header(reference='chr1', strand='+',
                        category='NOT-A-FAMILY',
                        source='testA-2017-06-08.bam'),
            np.array([], dtype=dtype_loci)),
        loci.Contig(
            loci.Header(reference='chr1', strand='-',
                        category='NOT-A-FAMILY',
                        source='testA-2017-06-08.bam'),
            np.array([], dtype=dtype_loci)))
    assert query == answer
def extract_informative_read_tips(bams, references, categories,
                                  quality=0, tag='ME'):
    """
    Extract the tips of 'informative' reads from one or more bam files.

    Informative reads are those that flank potential transposon insertions.
    The specific element (mate element) that each read is linked to should
    be stored using a sam tag which is 'ME' by default.

    Reads are categorised by transposon (super-)families by matching family
    names to the start of each reads mate-element name.

    :param bams: Path(s) to one or more bam files
    :type bams: str | list[str]
    :param references: Name(s) of one or more bam references
    :type references: str | list[str]
    :param categories: Name(s) of one or more transposon (super-)families
    :type categories: str | list[str]
    :param quality: Minimum mapping quality of reads
    :type quality: int
    :param tag: Sam tag containing each reads mate element name
    :type tag: str

    :return: A set of contigs of read tips categorised by reference, strand,
        category (family), and source (bam file name)
    :rtype: :class:`loci2.ContigSet`
    """
    # normalise scalar arguments to lists
    if isinstance(bams, str):
        bams = [bams]
    if isinstance(references, str):
        references = [references]
    if isinstance(categories, str):
        categories = [categories]

    # pre-create an (initially empty) deque for every possible header so
    # combinations with no reads still yield an empty contig
    keys = product([ref.split(':')[0] for ref in references],
                   ['+', '-'],
                   categories,
                   [os.path.basename(bam) for bam in bams])
    dictionary = {loci.Header(*key): deque() for key in keys}

    for bam in bams:
        for reference in references:
            for read in _extract_bam_read_data(bam, reference,
                                               quality=quality, tags=[tag]):
                # match to a category
                category_matches = tuple(
                    filter(lambda x: read[tag].startswith(x), categories))

                # only include reads for specified categories
                if category_matches:

                    # longest matching category is the best category
                    category = max(category_matches, key=len)

                    # read header
                    header = loci.Header(reference=read['reference'],
                                         strand=read['strand'],
                                         category=category,
                                         source=read['source'])

                    # append loci data to queue: the tip is the read end
                    # nearest the putative insertion (start on '-',
                    # stop on '+')
                    tip = read['start'] if \
                        read['strand'] == '-' else \
                        read['stop']
                    dictionary[header].append((tip, read[tag]))

    dtype = np.dtype([('tip', np.int64), ('element', 'O')])
    return loci.ContigSet(*(loci.Contig(header, np.array(data, dtype=dtype))
                            for header, data in dictionary.items()))
def extract_gff_intervals(gff, references, categories):
    """
    Extract known transposon intervals from a gff annotation file.

    :param gff: Path to a gff file of transposon annotations
    :type gff: str
    :param references: Name(s) of one or more bam references
    :type references: str | list[str]
    :param categories: Name(s) of one or more transposon (super-)families
    :type categories: str | list[str]

    :return: A set of contigs of intervals categorised by reference,
        category (family), and source (gff file name)
    :rtype: :class:`loci2.ContigSet`
    """
    # normalise scalar arguments to lists
    if isinstance(references, str):
        references = [references]
    if isinstance(categories, str):
        categories = [categories]

    source = os.path.basename(gff)
    # strip any ':start-stop' suffix from reference names
    references = [reference.split(':')[0] for reference in references]

    # pre-create an (initially empty) deque per (reference, category) so
    # combinations with no annotations still yield an empty contig
    keys = product(references, categories)
    dictionary = {
        loci.Header(reference=key[0], category=key[1], source=source): deque()
        for key in keys
    }

    with zopen(gff, 'rb') as infile:
        for line in infile:
            line = line.decode().split('\t')

            # match to reference:
            reference = decode_column(line[0])
            if reference in references:

                # match to a category (gff column 3, the feature type)
                feature_type = decode_column(line[2])
                category_matches = tuple(
                    filter(lambda x: feature_type.startswith(x), categories))

                # only include reads for specified categories
                if category_matches:

                    # longest matching category is the best category
                    category = max(category_matches, key=len)

                    header = loci.Header(reference=reference,
                                         category=category,
                                         source=source)
                    # gff columns 4 and 5 are the 1-based start/stop
                    dictionary[header].append(
                        (int(line[3]), int(line[4]), feature_type))

    dtype = np.dtype([('start', np.int64),
                      ('stop', np.int64),
                      ('element', '<O')])
    return loci.ContigSet(*(loci.Contig(header, np.array(data, dtype=dtype))
                            for header, data in dictionary.items()))
def extract_anchor_intervals(bams, references, known_transposons,
                             insert_size, quality=0):
    """
    Extract 'anchor' read inserts from one or more bam files.

    Anchor reads are paired reads in which neither has been mapped to a
    known transposon. The pair has then been mapped to a reference genome.
    Assuming that the insert size of the pair is smaller than the length of
    a transposon, the insert can be used to indicate a section of the
    samples genome in which there are no transposons on at least one
    allele. This can be used to infer heterozygosity of transposon
    insertions.

    Known transposon inserts from the reference genome are required for
    checking that anchor inserts overlapping these transposons are of a
    sensible length.

    Anchor reads are compressed to their interval unions for efficiency.

    :param bams: Path(s) to one or more bam files
    :type bams: str | list[str]
    :param references: Name(s) of one or more bam references
    :type references: str | list[str]
    :param known_transposons: Transposons known from the reference genome
    :type known_transposons: :class:`loci2.ContigSet`
    :param insert_size: Read pair insert size
    :type insert_size: int
    :param quality: Minimum mapping quality of anchor reads
    :type quality: int

    :return: A set of contigs of unions of anchor inserts categorised by
        reference, strand, and source (bam file name)
    :rtype: :class:`loci2.ContigSet`
    """
    # normalise scalar arguments to lists
    if isinstance(bams, str):
        bams = [bams]
    if isinstance(references, str):
        references = [references]

    # simplify known transposon headers for comparison
    known_transposons = known_transposons.map(lambda x: loci.mutate_header(
        x, strand='.', category=None, source=None),
                                              append_duplicate_headers=True)

    # one job per (bam, reference) combination
    jobs = product(bams, references)
    dtype = np.dtype([('start', np.int64), ('stop', np.int64)])
    intervals = loci.ContigSet()

    for bam, reference in jobs:
        header = loci.Header(reference=reference.split(':')[0],
                             source=os.path.basename(bam),
                             strand='.')
        anchors = np.fromiter(_extract_bam_anchor_insert_data(bam,
                                                              reference,
                                                              quality=quality),
                              dtype=dtype)
        anchor_lengths = interval.lengths(anchors)

        # calculate lengths of known transposons within each anchor interval
        reference_name = reference.split(':')[0]
        local_tes_header = loci.Header(reference=reference_name, strand='.')
        local_tes = known_transposons[local_tes_header]
        contained_te_lengths = interval.length_of_contains(
            anchors, local_tes.loci)

        # filter anchors based on insert size: discount the length of any
        # contained known transposons before comparing to insert_size
        adjusted_anchor_lengths = anchor_lengths - contained_te_lengths
        anchors = anchors[adjusted_anchor_lengths <= insert_size]

        # use unions of filtered anchors as loci
        intervals.add(loci.unions(loci.Contig(header=header, loci=anchors)))
    return intervals
def pair_clusters(clusters, distance=0, use_known_elements=True):
    """
    Join matching clusters on opposite strands.

    Clusters of the same category are joined if they are within
    2 * distance of one another. Clusters may also be joined if they have
    both been matched to the same known element.

    Fields required in 'clusters':
    'start': int, 'stop': int, 'median': int, 'known_element': str,
    'ID': str

    Fields appended to return value:
    'pair' str

    :param clusters: a collection of cluster loci (intervals)
    :type clusters: :class:`loci.ContigSet`
    :param distance: the distance to search out from each cluster
    :type distance: int
    :param use_known_elements: specify whether to join pairs based on a
        common known element (default: True)
    :type use_known_elements: bool

    :return: a collection of cluster loci (intervals) with 'pair' field
    :rtype: :class:`loci.ContigSet`
    """
    joint_clusters = loci.ContigSet()
    dtype_join_data = np.dtype([("pair", "<O")])

    # new headers based on old but un-stranded
    new_headers = {h.mutate(strand='.') for h in clusters.headers()}
    for header in new_headers:

        # get forward and reverse loci for this key
        forward = clusters[header.mutate(strand='+')]
        reverse = clusters[header.mutate(strand='-')]

        # sort them into pairs based on median
        pairs = _cluster_pairer(forward,
                                reverse,
                                distance=distance,
                                use_known_elements=use_known_elements)

        # create arrays for the new data, defaulting to the '.' placeholder
        forward_join_data = np.empty(len(forward), dtype=dtype_join_data)
        forward_join_data["pair"] = '.'
        reverse_join_data = np.empty(len(reverse), dtype=dtype_join_data)
        reverse_join_data["pair"] = '.'

        # cross-reference IDs for complete pairs; clusters left unpaired
        # keep the '.' placeholder (dead `else: pass` branch removed)
        for f, r in pairs:
            if f is not None and r is not None:
                forward_join_data[f]["pair"] = reverse.loci[r]["ID"]
                reverse_join_data[r]["pair"] = forward.loci[f]["ID"]

        # combine existing data with join data and add to new contig set
        joint_clusters.add(
            loci.Contig(header.mutate(strand='+'),
                        util.numpy.array.bind(forward.loci,
                                              forward_join_data)))
        joint_clusters.add(
            loci.Contig(header.mutate(strand='-'),
                        util.numpy.array.bind(reverse.loci,
                                              reverse_join_data)))
    return joint_clusters
def test_init_empty(self):
    """A ContigSet can be constructed with no contigs."""
    empty_set = loci.ContigSet()
    assert type(empty_set) == loci.ContigSet