def test_contained(intervals, interval):
    """
    Check that bin pre-selection never drops a contained interval.

    Every interval in `intervals` that lies completely inside the query
    `interval` must also be selected when filtering on the bins returned by
    `binning.contained_bins` for that query interval.
    """
    start, stop = interval

    # Intervals completely contained by the query interval.
    contained = {(x, y) for x, y in intervals
                 if start <= x and y <= stop}

    # The candidate bins depend only on the query interval, so compute them
    # once instead of once per tested interval.
    candidate_bins = set(binning.contained_bins(start, stop))

    # Pre-selection of intervals using binning.
    binned = {(x, y) for x, y in intervals
              if binning.assign_bin(x, y) in candidate_bins}

    assert binned.issuperset(contained)
Exemplo n.º 2
0
    def list_view(cls, begin, count, region, queries=None, order=None):
        """
        Returns a collection of variants in the `variant_collection` field.

        Variants are the distinct (chromosome, position, reference, observed)
        combinations among the observations located in `region` and joined to
        a sample used by at least one of `queries`. The return value is a
        tuple of the count of the unpaginated query and a JSON response
        holding the page of at most `count` items starting at offset `begin`.

        Raises `ValidationError` if normalizing the region fails with a
        `ReferenceMismatch`.
        """
        if not queries:
            queries = []

        # Todo: Document that `begin` and `end` are 1-based and inclusive. Or,
        #     perhaps we should change that to conform to BED track regions.
        try:
            normalized = normalize_region(
                region['chromosome'], region['begin'], region['end'])
        except ReferenceMismatch as error:
            raise ValidationError(str(error))
        chromosome, begin_position, end_position = normalized

        for query in queries:
            # Singleton queries drop the active and coverage profile
            # requirements before being authorized.
            if query.singleton:
                query.require_active = False
                query.require_coverage_profile = False
            _authorize_query(query)

        # Set of samples IDs considered by all queries together.
        all_sample_ids = set()
        for query in queries:
            all_sample_ids.update(sample.id for sample in query.samples)

        # Observations in the region, pre-selected by bin.
        bins = binning.contained_bins(begin_position - 1, end_position)
        in_region = Observation.query.filter(
            Observation.chromosome == chromosome,
            Observation.position >= begin_position,
            Observation.position <= end_position,
            Observation.bin.in_(bins))

        # Restrict to observations considered by all queries together.
        in_samples = in_region.join(Variation).join(Sample).filter(
            Sample.id.in_(all_sample_ids))

        # One row per variant, in the requested order.
        unique_variants = in_samples.distinct(
            Observation.chromosome,
            Observation.position,
            Observation.reference,
            Observation.observed)
        ordering = [getattr(getattr(Observation, field), direction)()
                    for field, direction in cls.get_order(order)]
        observations = unique_variants.order_by(*ordering)

        # Serialize only the requested page.
        items = []
        for observation in observations.limit(count).offset(begin):
            variant = (observation.chromosome, observation.position,
                       observation.reference, observation.observed)
            items.append(cls.serialize(variant, queries=queries))

        total = observations.count()
        response = jsonify(variant_collection={'uri': cls.collection_uri(),
                                               'items': items})
        return total, response
Exemplo n.º 3
0
def test_contained_bins(start, stop, expected):
    """
    Check `binning.contained_bins` against a known result for one interval.
    """
    bins = binning.contained_bins(start, stop)
    assert bins == expected
Exemplo n.º 4
0
Arquivo: tasks.py Projeto: varda/varda
def annotate_regions(original_regions, annotated_variants,
                     original_filetype='bed', annotated_filetype='csv',
                     queries=None, original_records=1):
    """
    Read regions from a file and write variant frequencies to another file.

    :arg original_regions: Open handle to a file with regions.
    :type original_regions: file-like object
    :arg annotated_variants: Open handle to write annotated variants to.
    :type annotated_variants: file-like object
    :kwarg original_filetype: Filetype for regions (currently only ``bed``
        allowed).
    :type original_filetype: str
    :kwarg annotated_filetype: Filetype for annotated variants (currently only
        ``csv`` allowed).
    :type annotated_filetype: str
    :arg queries: List of sample queries to compute frequencies over.
    :type queries: list of Query
    :arg original_records: Number of records in original regions file.
    :type original_records: int

    :raises ReadError: If `original_filetype` is not ``bed`` or
        `annotated_filetype` is not ``csv``.

    The output file contains the following columns for information on each
    variant:

    - ``CHROMOSOME``: Chromosome name in the reference genome.
    - ``POSITION``: One-based position of ``REFERENCE`` and ``OBSERVED`` on
      ``CHROMOSOME``.
    - ``REFERENCE``: Reference allele.
    - ``OBSERVED``: Observed (alternate) allele.

    Frequency information is annotated using several additional columns in the
    output file. For each query, we use the following columns, where the
    ``<Q>`` prefix is the query name:

    - ``<Q>_VN``: For each alternate allele, the number of individuals used
      for calculating ``<Q>_VF``, i.e., the number of individuals that have
      this region covered.
    - ``<Q>_VF``: For each alternate allele, the observed frequency, i.e., the
      ratio of individuals in which the allele was observed.
    - ``<Q>_VF_HET``: For each alternate allele, the observed heterozygous
      frequency, i.e., the ratio of individuals in which the allele was
      observed heterozygous.
    - ``<Q>_VF_HOM``: For each alternate allele, the observed homozygous
      frequency, i.e., the ratio of individuals in which the allele was
      observed homozygous.

    Note that the ``<Q>_VF_HET`` and ``<Q>_VF_HOM`` values for a particular
    alternate allele might not add up to the ``<Q>_VF`` value, since there can
    be observations where the exact genotype is unknown.

    If the query specifies exactly one sample and that sample does not have
    coverage information, ``<Q>_VN`` is simply the number of individuals
    contained in the sample.
    """
    queries = queries or []

    if original_filetype != 'bed':
        raise ReadError('Original data must be in BED format')

    if annotated_filetype != 'csv':
        raise ReadError('Annotated data must be in CSV format')

    # Set of samples IDs that are considered by all queries together.
    all_sample_ids = {sample.id
                      for query in queries
                      for sample in query.samples}

    header_fields = ['CHROMOSOME', 'POSITION', 'REFERENCE', 'OBSERVED']

    # Header lines in CSV output for each query.
    for query in queries:
        header_fields.extend([query.name + '_VN', query.name + '_VF',
                              query.name + '_VF_HET', query.name + '_VF_HOM'])
        description = ('Number of individuals in %s having this region covered'
                       % query.name)
        if not query.require_coverage_profile:
            description += ' (or without coverage profile)'
        description += ' (out of %i considered).' % sum(sample.pool_size for
                                                        sample in query.samples)
        # TODO: If it is a singleton query, removing the "... having this
        # region covered ..." part.
        annotated_variants.write(
            '##' + query.name + '_VN: %s.\n' % description)
        annotated_variants.write(
            '##' + query.name + '_VF: Ratio of individuals in %s in which the '
            'allele was observed.\n' % query.name)
        annotated_variants.write(
            '##' + query.name + '_VF_HET: Ratio of individuals in %s in which the '
            'allele was observed as heterozygous.\n' % query.name)
        annotated_variants.write(
            '##' + query.name + '_VF_HOM: Ratio of individuals in %s in which the '
            'allele was observed as homozygous.\n' % query.name)

    annotated_variants.write('#' + '\t'.join(header_fields) + '\n')

    old_percentage = -1
    for current_record, chromosome, begin, end in read_regions(original_regions):
        # Progress is capped at 99 inside the loop.
        # NOTE(review): this relies on `/` being true division (Python 3 or
        # `from __future__ import division`) -- confirm for this module.
        percentage = min(int(current_record / original_records * 100), 99)
        if percentage > old_percentage:
            # Todo: Task state updating should be defined in the task itself,
            #     perhaps we can give values using a callback.
            try:
                current_task.update_state(state='PROGRESS',
                                          meta={'percentage': percentage})
            except AttributeError:
                # Hack for the unit tests where we call this not from within
                # a task.
                pass
            old_percentage = percentage

        # Set of observations considered by all queries together. The bin
        # filter pre-selects candidates; the position filters are exact.
        bins = binning.contained_bins(begin - 1, end)
        observations = Observation.query.filter(
            Observation.chromosome == chromosome,
            Observation.position >= begin,
            Observation.position <= end,
            Observation.bin.in_(bins)
        ).join(Variation).join(Sample).filter(
            Sample.id.in_(all_sample_ids)
        ).distinct(
            Observation.chromosome,
            Observation.position,
            Observation.reference,
            Observation.observed
        ).order_by(
            Observation.chromosome,
            Observation.position,
            Observation.reference,
            Observation.observed,
            Observation.id
        )

        for observation in observations:
            fields = [observation.chromosome, observation.position,
                      observation.reference, observation.observed]

            # Iterate over the queries themselves: each one contributes its
            # own VN, VF, VF_HET and VF_HOM columns, in header order.
            for query in queries:
                vn, vf = calculate_frequency(observation.chromosome,
                                             observation.position,
                                             observation.reference,
                                             observation.observed,
                                             samples=query.samples)
                fields.extend([vn, sum(vf.values()), vf['heterozygous'],
                               vf['homozygous']])

            # Todo: Stringify per value, not in one sweep.
            annotated_variants.write('\t'.join(str(f) for f in fields) + '\n')