Example #1
File: misc.py Project: tpoterba/hail
def rename_duplicates(dataset, name='unique_id') -> MatrixTable:
    """Rename duplicate column keys.

    .. include:: ../_templates/req_tstring.rst

    Examples
    --------

    >>> renamed = hl.rename_duplicates(dataset).cols()
    >>> duplicate_samples = (renamed.filter(renamed.s != renamed.unique_id)
    ...                             .select()
    ...                             .collect())

    Notes
    -----

    This method produces a new column field from the string column key by
    appending a unique suffix ``_N`` as necessary. For example, if the column
    key "NA12878" appears three times in the dataset, the first will produce
    "NA12878", the second will produce "NA12878_1", and the third will produce
    "NA12878_2". The name of this new field is parameterized by `name`.

    Parameters
    ----------
    dataset : :class:`.MatrixTable`
        Dataset.
    name : :obj:`str`
        Name of new field.

    Returns
    -------
    :class:`.MatrixTable`
    """

    require_col_key_str(dataset, 'rename_duplicates')
    ids = dataset.col_key[0].collect()
    uniques = set()
    mapping = []
    new_ids = []

    fmt = lambda s, i: '{}_{}'.format(s, i)
    for s in ids:
        s_ = s
        i = 0
        while s_ in uniques:
            i += 1
            s_ = fmt(s, i)

        if s_ != s:
            mapping.append((s, s_))
        uniques.add(s_)
        new_ids.append(s_)

    if mapping:
        info(f'Renamed {len(mapping)} duplicate {plural("sample ID", len(mapping))}. Mangled IDs as follows:' +
             ''.join(f'\n  "{pre}" => "{post}"' for pre, post in mapping))
    else:
        info('No duplicate sample IDs found.')
    uid = Env.get_uid()
    return dataset.annotate_cols(**{name: hl.literal(new_ids)[hl.int(hl.scan.count())]})
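The suffixing rule described in the Notes can be reproduced without Hail. The snippet below is a minimal pure-Python sketch of the same loop over collected IDs; `suffix_duplicates` is a hypothetical name, not part of the Hail API.

def suffix_duplicates(ids):
    # Keep the first occurrence unchanged; append '_1', '_2', ... to later duplicates.
    seen = set()
    out = []
    for s in ids:
        candidate, i = s, 0
        while candidate in seen:
            i += 1
            candidate = f'{s}_{i}'
        seen.add(candidate)
        out.append(candidate)
    return out

print(suffix_duplicates(['NA12878', 'NA12878', 'NA12878', 'NA12879']))
# ['NA12878', 'NA12878_1', 'NA12878_2', 'NA12879']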
Example #2
def hwe_normalize(call_expr):
    mt = matrix_table_source('hwe_normalize/call_expr', call_expr)
    mt = mt.select_entries(__gt=call_expr.n_alt_alleles())
    mt = mt.annotate_rows(__AC=agg.sum(mt.__gt),
                          __n_called=agg.count_where(hl.is_defined(mt.__gt)))
    mt = mt.filter_rows((mt.__AC > 0) & (mt.__AC < 2 * mt.__n_called))

    n_variants = mt.count_rows()
    if n_variants == 0:
        raise FatalError(
            "hwe_normalize: found 0 variants after filtering out monomorphic sites."
        )
    info(
        f"hwe_normalize: found {n_variants} variants after filtering out monomorphic sites."
    )

    mt = mt.annotate_rows(__mean_gt=mt.__AC / mt.__n_called)
    mt = mt.annotate_rows(__hwe_scaled_std_dev=hl.sqrt(mt.__mean_gt *
                                                       (2 - mt.__mean_gt) *
                                                       n_variants / 2))
    mt = mt.unfilter_entries()

    normalized_gt = hl.or_else(
        (mt.__gt - mt.__mean_gt) / mt.__hwe_scaled_std_dev, 0.0)
    return normalized_gt
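For intuition, here is a NumPy-only sketch of the same per-variant transformation, under the assumption that genotypes sit in a dense variants-by-samples array with NaN marking missing calls (names are illustrative, not Hail API): drop monomorphic variants, mean-center each row, scale by sqrt(mean * (2 - mean) * n_variants / 2), and impute missing entries to 0.

import numpy as np

def hwe_normalize_numpy(gt):
    # gt: variants x samples array of alt-allele counts, np.nan where the call is missing
    called = ~np.isnan(gt)
    n_called = called.sum(axis=1)
    ac = np.nansum(gt, axis=1)
    keep = (ac > 0) & (ac < 2 * n_called)          # filter monomorphic variants
    gt, ac, n_called = gt[keep], ac[keep], n_called[keep]
    n_variants = gt.shape[0]
    mean_gt = ac / n_called
    hwe_std = np.sqrt(mean_gt * (2 - mean_gt) * n_variants / 2)
    normalized = (gt - mean_gt[:, None]) / hwe_std[:, None]
    return np.nan_to_num(normalized, nan=0.0)      # missing calls normalize to 0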
Example #3
    def plan(self, n_inputs: int) -> CombinerPlan:
        assert n_inputs > 0

        def int_ceil(x):
            return int(math.ceil(x))

        tree_height = int_ceil(math.log(n_inputs, self.branch_factor))
        phases: List[Phase] = []
        file_size: List[List[int]] = []  # List of file size per phase

        file_size.append([1 for _ in range(n_inputs)])
        while len(file_size[-1]) > 1:
            last_stage_files = file_size[-1]
            n = len(last_stage_files)
            i = 0
            jobs = []
            while i < n:
                job = []
                job_i = 0
                while job_i < self.batch_size and i < n:
                    merge = []
                    merge_i = 0
                    merge_size = 0
                    while merge_i < self.branch_factor and i < n:
                        merge_size += last_stage_files[i]
                        merge.append(i)
                        merge_i += 1
                        i += 1
                    job.append(Merge(merge, merge_size))
                    job_i += 1
                jobs.append(Job(job))
            file_size.append([
                merge.input_total_size for job in jobs for merge in job.merges
            ])
            phases.append(Phase(jobs))

        assert len(phases) == tree_height
        for layer in file_size:
            assert sum(layer) == n_inputs

        phase_strs = []
        total_jobs = 0
        for i, phase in enumerate(phases):
            n = len(phase.jobs)
            job_str = hl.utils.misc.plural('job', n)
            n_files_produced = len(file_size[i + 1])
            adjective = 'final' if n_files_produced == 1 else 'intermediate'
            file_str = hl.utils.misc.plural('file', n_files_produced)
            phase_strs.append(
                f'\n        Phase {i + 1}: {n} {job_str} corresponding to {n_files_produced} {adjective} output {file_str}.'
            )
            total_jobs += n

        info(
            f"GVCF combiner plan:\n"
            f"    Branch factor: {self.branch_factor}\n"
            f"    Batch size: {self.batch_size}\n"
            f"    Combining {n_inputs} input files in {tree_height} phases with {total_jobs} total jobs.{''.join(phase_strs)}\n"
        )
        return CombinerPlan(file_size, phases)
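The phase and job counts reported by this plan follow directly from the branch factor and batch size: each phase turns n files into ceil(n / branch_factor) merged outputs, grouped into jobs of at most batch_size merges. The sketch below (hypothetical `plan_summary` helper) reproduces just that arithmetic.

import math

def plan_summary(n_inputs, branch_factor, batch_size):
    phases = []
    n = n_inputs
    while n > 1:
        n_merges = math.ceil(n / branch_factor)        # outputs produced by this phase
        n_jobs = math.ceil(n_merges / batch_size)      # jobs needed to produce them
        phases.append((n_jobs, n_merges))
        n = n_merges
    return phases

print(plan_summary(n_inputs=1000, branch_factor=100, batch_size=5))
# [(2, 10), (1, 1)]: two phases, matching ceil(log_100(1000)) = 2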
Example #4
    def hailBlanczos(A, G, k, q, compute_U=False):

        G_i = hl.nd.qr(G)[0]
        g_list = [G_i]

        for j in range(0, q):
            info(f"blanczos_pca: Beginning iteration {j+1}/{q}")
            G_i = A.aggregate(hl.agg.ndarray_sum(
                A.ndarray.T @ (A.ndarray @ G_i)),
                              _localize=False)
            G_i = hl.nd.qr(G_i)[0]._persist()
            g_list.append(G_i)

        info("blanczos_pca: Iterations complete. Computing local QR")
        G = hl.nd.hstack(g_list)
        V = hl.nd.qr(G)[0]._persist()

        AV = A.select(ndarray=A.ndarray @ V)

        if compute_U:
            AV_local = hl.nd.vstack(AV.aggregate(hl.agg.collect(AV.ndarray)))
            U, R = hl.nd.qr(AV_local)._persist()
            return U, R, V
        else:
            Rs = hl.nd.vstack(
                AV.aggregate(hl.agg.collect(hl.nd.qr(AV.ndarray)[1])))
            R = hl.nd.qr(Rs)[1]._persist()
            return R, V
Example #5
File: misc.py Project: zhouhufeng/hail
def rename_duplicates(dataset, name='unique_id') -> MatrixTable:
    """Rename duplicate column keys.

    .. include:: ../_templates/req_tstring.rst

    Examples
    --------

    >>> renamed = hl.rename_duplicates(dataset).cols()
    >>> duplicate_samples = (renamed.filter(renamed.s != renamed.unique_id)
    ...                             .select()
    ...                             .collect())

    Notes
    -----

    This method produces a new column field from the string column key by
    appending a unique suffix ``_N`` as necessary. For example, if the column
    key "NA12878" appears three times in the dataset, the first will produce
    "NA12878", the second will produce "NA12878_1", and the third will produce
    "NA12878_2". The name of this new field is parameterized by `name`.

    Parameters
    ----------
    dataset : :class:`.MatrixTable`
        Dataset.
    name : :obj:`str`
        Name of new field.

    Returns
    -------
    :class:`.MatrixTable`
    """

    require_col_key_str(dataset, 'rename_duplicates')
    ids = dataset.col_key[0].collect()
    uniques = set()
    mapping = []
    new_ids = []

    fmt = lambda s, i: '{}_{}'.format(s, i)
    for s in ids:
        s_ = s
        i = 0
        while s_ in uniques:
            i += 1
            s_ = fmt(s, i)

        if s_ != s:
            mapping.append((s, s_))
        uniques.add(s_)
        new_ids.append(s_)

    if mapping:
        info(f'Renamed {len(mapping)} duplicate {plural("sample ID", len(mapping))}. Mangled IDs as follows:' +
             ''.join(f'\n  "{pre}" => "{post}"' for pre, post in mapping))
    else:
        info('No duplicate sample IDs found.')
    uid = Env.get_uid()
    return dataset.annotate_cols(**{name: hl.literal(new_ids)[hl.int(hl.scan.count())]})
Example #6
    def _step_vdses(self):
        current_bin = original_bin = min(self.vdses)
        files_to_merge = self.vdses[current_bin][:self.branch_factor]
        if len(files_to_merge) == len(self.vdses[current_bin]):
            del self.vdses[current_bin]
        else:
            self.vdses[current_bin] = self.vdses[current_bin][self.branch_factor:]

        remaining = self.branch_factor - len(files_to_merge)
        while self._num_vdses > 0 and remaining > 0:
            current_bin = min(self.vdses)
            extra = self.vdses[current_bin][-remaining:]
            if len(extra) == len(self.vdses[current_bin]):
                del self.vdses[current_bin]
            else:
                self.vdses[current_bin] = self.vdses[current_bin][:-remaining]
            files_to_merge = extra + files_to_merge
            remaining = self.branch_factor - len(files_to_merge)

        new_n_samples = sum(f.n_samples for f in files_to_merge)
        info(
            f'VDS Combine (job {self._job_id}): merging {len(files_to_merge)} datasets with {new_n_samples} samples'
        )

        temp_path = self._temp_out_path(f'vds-combine_job{self._job_id}')
        largest_vds = max(files_to_merge, key=lambda vds: vds.n_samples)
        vds = hl.vds.read_vds(largest_vds.path)

        interval_bin = floor(log(new_n_samples, self.branch_factor))
        intervals, intervals_dtype = self.__intervals_cache.get(
            interval_bin, (None, None))

        if intervals is None:
            # we use the reference data since it generally has more rows than the variant data
            intervals, intervals_dtype = calculate_new_intervals(
                vds.reference_data, self.target_records,
                os.path.join(temp_path, 'interval_checkpoint.ht'))
            self.__intervals_cache[interval_bin] = (intervals, intervals_dtype)

        paths = [f.path for f in files_to_merge]
        vdss = read_variant_datasets(paths, intervals, intervals_dtype)
        combined = combine_variant_datasets(vdss)

        if self.finished:
            combined.write(self.output_path)
            return

        new_path = os.path.join(temp_path, 'dataset.vds')
        combined.write(new_path, overwrite=True)
        new_bin = floor(log(new_n_samples, self.branch_factor))
        # this ensures that we don't somehow stick a vds at the end of
        # the same bin, ending up with a weird ordering issue
        if new_bin <= original_bin:
            new_bin = original_bin + 1
        self.vdses[new_bin].append(
            VDSMetadata(path=new_path, n_samples=new_n_samples))
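The bin a dataset is filed under is floor(log(n_samples, branch_factor)), so datasets of similar size get merged together; the `new_bin = original_bin + 1` bump above then keeps a merged result from landing back in the bin it was drawn from. A tiny sketch of the binning rule (illustrative names only):

from math import floor, log

def vds_bin(n_samples, branch_factor=100):
    return floor(log(n_samples, branch_factor))

print([vds_bin(n) for n in (50, 5000, 500000)])   # [0, 1, 2]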
Example #7
    def _step_gvcfs(self):
        step = self.branch_factor
        files_to_merge = self.gvcfs[:self.gvcf_batch_size * step]
        self.gvcfs = self.gvcfs[self.gvcf_batch_size * step:]

        info(
            f'GVCF combine (job {self._job_id}): merging {len(files_to_merge)} GVCFs into '
            f'{(len(files_to_merge) + step - 1) // step} datasets')

        if self.gvcf_external_header is not None:
            sample_names = self.gvcf_sample_names[:self.gvcf_batch_size * step]
            self.gvcf_sample_names = self.gvcf_sample_names[self.gvcf_batch_size * step:]
        else:
            sample_names = None
        merge_vds = []
        merge_n_samples = []
        vcfs = [
            transform_gvcf(vcf,
                           reference_entry_fields_to_keep=self.gvcf_reference_entry_fields_to_keep,
                           info_to_keep=self.gvcf_info_to_keep)
            for vcf in hl.import_gvcfs(
                files_to_merge,
                self.gvcf_import_intervals,
                array_elements_required=False,
                _external_header=self.gvcf_external_header,
                _external_sample_ids=[[name] for name in sample_names]
                if sample_names is not None else None,
                reference_genome=self.reference_genome,
                contig_recoding=self.contig_recoding)
        ]
        while vcfs:
            merging, vcfs = vcfs[:step], vcfs[step:]
            merge_vds.append(combine_variant_datasets(merging))
            merge_n_samples.append(len(merging))
        if self.finished and len(merge_vds) == 1:
            merge_vds[0].write(self.output_path)
            return

        temp_path = self._temp_out_path(
            f'gvcf-combine_job{self._job_id}/dataset_')
        pad = len(str(len(merge_vds) - 1))
        merge_metadata = [
            VDSMetadata(path=temp_path + str(count).rjust(pad, '0') + '.vds',
                        n_samples=n_samples)
            for count, n_samples in enumerate(merge_n_samples)
        ]
        paths = [md.path for md in merge_metadata]
        hl.vds.write_variant_datasets(merge_vds,
                                      paths,
                                      overwrite=True,
                                      codec_spec=FAST_CODEC_SPEC)
        for md in merge_metadata:
            self.vdses[max(1, floor(log(md.n_samples,
                                        self.branch_factor)))].append(md)
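The slicing at the top of this method takes at most gvcf_batch_size * branch_factor files per job and then splits them into merge groups of branch_factor. A pure-Python sketch of that bookkeeping (hypothetical `merge_groups` helper):

def merge_groups(files, branch_factor, batch_size):
    take = batch_size * branch_factor
    batch, rest = files[:take], files[take:]
    groups = [batch[i:i + branch_factor] for i in range(0, len(batch), branch_factor)]
    return groups, rest

groups, rest = merge_groups(list(range(23)), branch_factor=5, batch_size=3)
print([len(g) for g in groups], len(rest))   # [5, 5, 5] 8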
Example #8
def value_type():
    code = flask.request.json
    info(f'value type: {code}')
    try:
        jir = Env.hail().expr.ir.IRParser.parse_value_ir(code, {}, {})
        result = jir.typ().toString()
        info(f'result: {result}')
        return flask.jsonify(result)
    except FatalError as e:
        return flask.jsonify({'message': e.args[0]}), 400
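For context, a handler like this is registered on a Flask app and invoked over HTTP. The wiring below is an illustration only: the app object and the '/value_type' route are assumptions, not the actual service configuration, and the request itself is left commented out because it needs a running Hail backend.

import flask

app = flask.Flask(__name__)
app.add_url_rule('/value_type', view_func=value_type, methods=['POST'])

# with app.test_client() as client:
#     resp = client.post('/value_type', json=ir_text)   # ir_text: textual IR produced by the Hail client
#     print(resp.status_code, resp.get_json())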
Example #9
 def copy_log(self, path: str) -> None:
     log = Env.hc()._log
     try:
         if self.is_dir(path):
             _, tail = os.path.split(log)
             path = os.path.join(path, tail)
         info(f"copying log to {repr(path)}...")
         self.copy(local_path_uri(Env.hc()._log), path)
     except Exception as e:
         sys.stderr.write(f'Could not copy log: encountered error:\n  {e}')
Example #10
File: fs.py Project: jigold/hail
 def copy_log(self, path: str) -> None:
     log = Env.hc()._log
     try:
         if self.is_dir(path):
             _, tail = os.path.split(log)
             path = os.path.join(path, tail)
         info(f"copying log to {repr(path)}...")
         self.copy(local_path_uri(Env.hc()._log), path)
     except Exception as e:
         sys.stderr.write(f'Could not copy log: encountered error:\n  {e}')
Example #11
def value_type():
    code = flask.request.json

    info(f'value type: {code}')

    jir = Env.hail().expr.ir.IRParser.parse_value_ir(code, {}, {})
    result = jir.typ().toString()

    info(f'result: {result}')

    return flask.jsonify(result)
Example #12
File: apiserver.py Project: jigold/hail
async def value_type(request):
    code = await request.json()
    info(f'value type: {code}')
    try:
        result = await run(blocking_value_type, code)
        info(f'result: {result}')
        return web.json_response(result)
    except FatalError as e:
        return web.json_response({
            'message': e.args[0]
        }, status=400)
Example #13
async def table_type(request, userdata):
    code = await request.json()
    info(f'table type: {code}')
    try:
        result = await run(blocking_table_type, code)
        info(f'result: {result}')
        return web.json_response(result)
    except FatalError as e:
        return web.json_response({
            'message': e.args[0]
        }, status=400)
Example #14
def get_gene_intervals(gene_symbols=None, gene_ids=None, transcript_ids=None,
                       verbose=True, reference_genome=None, gtf_file=None):
    """Get intervals of genes or transcripts.

    Get the boundaries of genes or transcripts from a GTF file, for quick filtering of a Table or MatrixTable.

    On Google Cloud platform:
    Gencode v19 (GRCh37) GTF available at: gs://hail-common/references/gencode/gencode.v19.annotation.gtf.bgz
    Gencode v29 (GRCh38) GTF available at: gs://hail-common/references/gencode/gencode.v29.annotation.gtf.bgz

    Example
    -------
    >>> hl.filter_intervals(ht, get_gene_intervals(gene_symbols=['PCSK9'], reference_genome='GRCh37'))  # doctest: +SKIP

    Parameters
    ----------

    gene_symbols : :obj:`list` of :obj:`str`, optional
       Gene symbols (e.g. PCSK9).
    gene_ids : :obj:`list` of :obj:`str`, optional
       Gene IDs (e.g. ENSG00000223972).
    transcript_ids : :obj:`list` of :obj:`str`, optional
       Transcript IDs (e.g. ENST00000456328).
    verbose : :obj:`bool`
       If ``True``, print which genes and transcripts were matched in the GTF file.
    reference_genome : :obj:`str` or :class:`.ReferenceGenome`, optional
       Reference genome to use (passed along to import_gtf).
    gtf_file : :obj:`str`
       GTF file to load. If none is provided, but `reference_genome` is one of
       `GRCh37` or `GRCh38`, a default will be used (on Google Cloud Platform).

    Returns
    -------
    :obj:`list` of :class:`.Interval`
    """
    if gene_symbols is None and gene_ids is None and transcript_ids is None:
        raise ValueError('get_gene_intervals requires at least one of gene_symbols, gene_ids, or transcript_ids')
    ht = _load_gencode_gtf(gtf_file, reference_genome)
    criteria = []
    if gene_symbols:
        criteria.append(hl.any(lambda y: (ht.feature == 'gene') & (ht.gene_name == y), gene_symbols))
    if gene_ids:
        criteria.append(hl.any(lambda y: (ht.feature == 'gene') & (ht.gene_id == y.split('\\.')[0]), gene_ids))
    if transcript_ids:
        criteria.append(hl.any(lambda y: (ht.feature == 'transcript') & (ht.transcript_id == y.split('\\.')[0]), transcript_ids))

    ht = ht.filter(functools.reduce(operator.ior, criteria))
    gene_info = ht.aggregate(hl.agg.collect((ht.feature, ht.gene_name, ht.gene_id, ht.transcript_id, ht.interval)))
    if verbose:
        info(f'get_gene_intervals found {len(gene_info)} entries:\n'
             + "\n".join(map(lambda x: f'{x[0]}: {x[1]} ({x[2] if x[0] == "gene" else x[3]})', gene_info)))
    intervals = list(map(lambda x: x[-1], gene_info))
    return intervals
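The filter above ORs together however many criteria were supplied, using `functools.reduce` with `operator.ior`. The same pattern works for anything supporting `|`, including Hail boolean expressions; here is a plain-boolean sketch of the idiom.

import functools
import operator

criteria = [False, True, False]                  # stand-ins for per-criterion boolean expressions
combined = functools.reduce(operator.ior, criteria)
print(combined)                                  # True: keep a row if any criterion matches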
Example #15
def execute():
    code = flask.request.json
    info(f'execute: {code}')
    try:
        jir = Env.hail().expr.ir.IRParser.parse_value_ir(code, {}, {})
        typ = hl.dtype(jir.typ().toString())
        value = Env.hail().backend.spark.SparkBackend.executeJSON(jir)
        result = {'type': str(typ), 'value': value}
        info(f'result: {result}')
        return flask.jsonify(result)
    except FatalError as e:
        return flask.jsonify({'message': e.args[0]}), 400
Example #16
def table_type():
    code = flask.request.json

    info(f'table type: {code}')

    jir = Env.hail().expr.ir.IRParser.parse_table_ir(code, {}, {})
    ttyp = hl.ttable._from_java(jir.typ())
    
    result = {'global': str(ttyp.global_type),
              'row': str(ttyp.row_type),
              'row_key': ttyp.row_key}
    
    info(f'result: {result}')

    return flask.jsonify(result)
Example #17
def execute():
    code = flask.request.json

    info(f'execute: {code}')

    jir = Env.hail().expr.ir.IRParser.parse_value_ir(code, {}, {})

    typ = hl.dtype(jir.typ().toString())
    value = Env.hail().expr.ir.Interpret.interpretJSON(jir)

    result = {'type': str(typ), 'value': value}

    info(f'result: {result}')

    return flask.jsonify(result)
Example #18
def table_type():
    code = flask.request.json
    info(f'table type: {code}')
    try:
        jir = Env.hail().expr.ir.IRParser.parse_table_ir(code, {}, {})
        ttyp = hl.ttable._from_java(jir.typ())
        result = {
            'global': str(ttyp.global_type),
            'row': str(ttyp.row_type),
            'row_key': ttyp.row_key
        }
        info(f'result: {result}')
        return flask.jsonify(result)
    except FatalError as e:
        return flask.jsonify({'message': e.args[0]}), 400
Example #19
def blockmatrix_type():
    code = flask.request.json
    info(f'blockmatrix type: {code}')
    try:
        jir = Env.hail().expr.ir.IRParser.parse_blockmatrix_ir(code, {}, {})
        bmtyp = hl.tblockmatrix._from_java(jir.typ())
        result = {
            'element_type': str(bmtyp.element_type),
            'shape': bmtyp.shape,
            'is_row_vector': bmtyp.is_row_vector,
            'block_size': bmtyp.block_size
        }
        info(f'result: {result}')
        return flask.jsonify(result)
    except FatalError as e:
        return flask.jsonify({'message': e.args[0]}), 400
Example #20
def matrix_type():
    code = flask.request.json
    info(f'matrix type: {code}')
    try:
        jir = Env.hail().expr.ir.IRParser.parse_matrix_ir(code, {}, {})
        mtyp = hl.tmatrix._from_java(jir.typ())
        result = {
            'global': str(mtyp.global_type),
            'col': str(mtyp.col_type),
            'col_key': mtyp.col_key,
            'row': str(mtyp.row_type),
            'row_key': mtyp.row_key,
            'entry': str(mtyp.entry_type)
        }
        info(f'result: {result}')
        return flask.jsonify(result)
    except FatalError as e:
        return flask.jsonify({'message': e.args[0]}), 400
Example #21
def execute():
    code = flask.request.json
    
    info(f'execute: {code}')
    
    jir = Env.hail().expr.Parser.parse_value_ir(code, {}, {})
    
    typ = hl.HailType._from_java(jir.typ())
    value = Env.hail().expr.ir.Interpret.interpretPyIR(code, {}, {})

    result = {
        'type': str(typ),
        'value': value
    }
    
    info(f'result: {result}')
    
    return flask.jsonify(result)
Example #22
def execute():
    code = flask.request.json
    
    info(f'execute: {code}')
    
    jir = Env.hail().expr.ir.IRParser.parse_value_ir(code, {}, {})
    
    typ = hl.HailType._from_java(jir.typ())
    value = Env.hail().expr.ir.Interpret.interpretPyIR(code, {}, {})

    result = {
        'type': str(typ),
        'value': value
    }
    
    info(f'result: {result}')
    
    return flask.jsonify(result)
Example #23
    def run(self):
        flagname = 'no_ir_logging'
        prev_flag_value = hl._get_flags(flagname).get(flagname)
        hl._set_flags(**{flagname: '1'})

        vds_samples = sum(vds.n_samples for vdses in self.vdses.values()
                          for vds in vdses)
        info(
            'Running VDS combiner:\n'
            f'    VDS arguments: {self._num_vdses} datasets with {vds_samples} samples\n'
            f'    GVCF arguments: {len(self.gvcfs)} inputs/samples\n'
            f'    Branch factor: {self.branch_factor}\n'
            f'    GVCF merge batch size: {self.gvcf_batch_size}')
        while not self.finished:
            self.save()
            self.step()
        self.save()
        info('Finished VDS combiner!')
        hl._set_flags(**{flagname: prev_flag_value})
Example #24
def _reduced_svd(A: TallSkinnyMatrix,
                 k=10,
                 compute_U=False,
                 iterations=2,
                 iteration_size=None):
    # Set Parameters
    q = iterations
    if iteration_size is None:
        L = k + 2
    else:
        L = iteration_size
    assert ((q + 1) * L >= k)
    n = A.ncols

    # Generate random matrix G
    G = hl.nd.zeros((n, L)).map(lambda n: hl.rand_norm(0, 1))
    G = hl.nd.qr(G)[0]._persist()

    fact = _krylov_factorization(A, G, q, compute_U)
    info("_reduced_svd: Computing local SVD")
    return fact.reduced_svd(k)
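The overall scheme can be illustrated with a small NumPy sketch of randomized subspace iteration. This is a deliberately simplified, assumption-laden stand-in, not Hail's distributed implementation: it omits the stacking of intermediate subspaces that `_krylov_factorization` performs.

import numpy as np

def randomized_svd(A, k, q=2, oversample=2):
    n = A.shape[1]
    L = k + oversample
    rng = np.random.default_rng(0)
    G = np.linalg.qr(rng.standard_normal((n, L)))[0]   # random starting subspace
    for _ in range(q):                                 # power iterations with re-orthogonalization
        G = np.linalg.qr(A.T @ (A @ G))[0]
    B = A @ G                                          # tall-skinny projection of A
    U_small, S, Vt_small = np.linalg.svd(B, full_matrices=False)
    return U_small[:, :k], S[:k], (Vt_small @ G.T)[:k, :]

A = np.random.default_rng(1).standard_normal((1000, 50))
U, S, Vt = randomized_svd(A, k=10)
print(U.shape, S.shape, Vt.shape)   # (1000, 10) (10,) (10, 50)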
Example #25
def _make_tsm_from_call(call_expr,
                        block_size,
                        mean_center=False,
                        hwe_normalize=False):
    mt = matrix_table_source('_make_tsm/entry_expr', call_expr)
    mt = mt.select_entries(__gt=call_expr.n_alt_alleles())
    if mean_center or hwe_normalize:
        mt = mt.annotate_rows(__AC=agg.sum(mt.__gt),
                              __n_called=agg.count_where(hl.is_defined(
                                  mt.__gt)))
        mt = mt.filter_rows((mt.__AC > 0) & (mt.__AC < 2 * mt.__n_called))

        n_variants = mt.count_rows()
        if n_variants == 0:
            raise FatalError(
                "_make_tsm: found 0 variants after filtering out monomorphic sites."
            )
        info(
            f"_make_tsm: found {n_variants} variants after filtering out monomorphic sites."
        )

        mt = mt.annotate_rows(__mean_gt=mt.__AC / mt.__n_called)
        mt = mt.unfilter_entries()

        mt = mt.select_entries(__x=hl.or_else(mt.__gt - mt.__mean_gt, 0.0))

        if hwe_normalize:
            mt = mt.annotate_rows(
                __hwe_scaled_std_dev=hl.sqrt(mt.__mean_gt *
                                             (2 - mt.__mean_gt) * n_variants /
                                             2))
            mt = mt.select_entries(__x=mt.__x / mt.__hwe_scaled_std_dev)
    else:
        mt = mt.select_entries(__x=mt.__gt)

    A, ht = mt_to_table_of_ndarray(mt.__x,
                                   block_size,
                                   return_checkpointed_table_also=True)
    A = A.persist()
    return TallSkinnyMatrix(A, A.ndarray, ht, list(mt.col_key))
Example #26
File: hadoop_utils.py Project: bcajes/hail
def copy_log(path: str) -> None:
    """Attempt to copy the session log to a hadoop-API-compatible location.

    Examples
    --------
    Specify a manual path:

    >>> hl.copy_log('gs://my-bucket/analysis-10-jan19.log')  # doctest: +SKIP
    INFO: copying log to 'gs://my-bucket/analysis-10-jan19.log'...

    Copy to a directory:

    >>> hl.copy_log('gs://my-bucket/')  # doctest: +SKIP
    INFO: copying log to 'gs://my-bucket/hail-20180924-2018-devel-46e5fad57524.log'...

    Notes
    -----
    Since Hail cannot currently log directly to distributed file systems, this
    function is provided as a utility for offloading logs from ephemeral nodes.

    If `path` is a directory, then the log file will be copied using its
    base name to the directory (e.g. ``/home/hail.log`` would be copied as
    ``gs://my-bucket/hail.log`` if `path` is ``gs://my-bucket``).

    Parameters
    ----------
    path: :obj:`str`
    """
    log = Env.hc()._log
    try:
        if hadoop_is_dir(path):
            _, tail = os.path.split(log)
            path = os.path.join(path, tail)
        info(f"copying log to {repr(path)}...")
        hadoop_copy(local_path_uri(Env.hc()._log), path)
    except Exception as e:
        sys.stderr.write(f'Could not copy log: encountered error:\n  {e}')
Example #27
def copy_log(path: str) -> None:
    """Attempt to copy the session log to a hadoop-API-compatible location.

    Examples
    --------
    Specify a manual path:

    >>> hl.copy_log('gs://my-bucket/analysis-10-jan19.log')  # doctest: +SKIP
    INFO: copying log to 'gs://my-bucket/analysis-10-jan19.log'...

    Copy to a directory:

    >>> hl.copy_log('gs://my-bucket/')  # doctest: +SKIP
    INFO: copying log to 'gs://my-bucket/hail-20180924-2018-devel-46e5fad57524.log'...

    Notes
    -----
    Since Hail cannot currently log directly to distributed file systems, this
    function is provided as a utility for offloading logs from ephemeral nodes.

    If `path` is a directory, then the log file will be copied using its
    base name to the directory (e.g. ``/home/hail.log`` would be copied as
    ``gs://my-bucket/hail.log`` if `path` is ``gs://my-bucket``).

    Parameters
    ----------
    path: :obj:`str`
    """
    log = Env.hc()._log
    try:
        if hadoop_is_dir(path):
            _, tail = os.path.split(log)
            path = os.path.join(path, tail)
        info(f"copying log to {repr(path)}...")
        hadoop_copy(local_path_uri(Env.hc()._log), path)
    except Exception as e:
        sys.stderr.write(f'Could not copy log: encountered error:\n  {e}')
Example #28
File: pca.py Project: MariusDanner/hail
    def hailBlanczos(A, G, k, q):

        h_list = []
        G_i = hl.nd.qr(G)[0]

        for j in range(0, q):
            info(f"blanczos_pca: Beginning iteration {j + 1}/{q+1}")
            temp = A.annotate(H_i=A.ndarray @ G_i)
            temp = temp.annotate(G_i_intermediate=temp.ndarray.T @ temp.H_i)
            result = temp.aggregate(hl.struct(
                Hi_chunks=hl.agg.collect(temp.H_i),
                G_i=hl.agg.ndarray_sum(temp.G_i_intermediate)),
                                    _localize=False)._persist()
            localized_H_i = hl.nd.vstack(result.Hi_chunks)
            h_list.append(localized_H_i)
            G_i = hl.nd.qr(result.G_i)[0]

        info(f"blanczos_pca: Beginning iteration {q+ 1}/{q+1}")
        temp = A.annotate(H_i=A.ndarray @ G_i)
        result = temp.aggregate(hl.agg.collect(temp.H_i),
                                _localize=False)._persist()
        info("blanczos_pca: Iterations complete. Computing local QR")
        localized_H_i = hl.nd.vstack(result)
        h_list.append(localized_H_i)
        H = hl.nd.hstack(h_list)
        Q = hl.nd.qr(H)[0]._persist()
        A = A.annotate(part_size=A.ndarray.shape[0])
        A = A.annotate(rows_preceeding=hl.int32(hl.scan.sum(A.part_size)))
        A = A.annotate_globals(Qt=Q.T)
        T = A.annotate(ndarray=A.Qt[:, A.rows_preceeding:A.rows_preceeding +
                                    A.part_size] @ A.ndarray)
        arr_T = T.aggregate(hl.agg.ndarray_sum(T.ndarray), _localize=False)

        info("blanczos_pca: QR Complete. Computing local SVD")
        U, S, W = hl.nd.svd(arr_T, full_matrices=False)._persist()

        V = Q @ U

        truncV = V[:, :k]
        truncS = S[:k]
        truncW = W[:k, :]

        return truncV, truncS, truncW
Example #29
def multi_way_union_mts(mts: list, tmp_dir: str,
                        chunk_size: int) -> hl.MatrixTable:
    """Joins MatrixTables in the provided list

    :param list mts: list of MatrixTables to join together
    :param str tmp_dir: path to temporary directory for intermediate results
    :param int chunk_size: number of MatrixTables to join per chunk

    :return: joined MatrixTable
    :rtype: MatrixTable
    """

    staging = [mt.localize_entries("__entries", "__cols") for mt in mts]
    stage = 0
    while len(staging) > 1:
        n_jobs = int(math.ceil(len(staging) / chunk_size))
        info(f"multi_way_union_mts: stage {stage}: {n_jobs} total jobs")
        next_stage = []
        for i in range(n_jobs):
            to_merge = staging[chunk_size * i:chunk_size * (i + 1)]
            info(
                f"multi_way_union_mts: stage {stage} / job {i}: merging {len(to_merge)} inputs"
            )

            merged = hl.Table.multi_way_zip_join(to_merge, "__entries",
                                                 "__cols")
            merged = merged.annotate(__entries=hl.flatten(
                hl.range(hl.len(merged.__entries)).map(lambda i: hl.coalesce(
                    merged.__entries[i].__entries,
                    hl.range(hl.len(merged.__cols[i].__cols)).map(
                        lambda j: hl.null(merged.__entries.__entries.dtype.
                                          element_type.element_type)),
                ))))
            merged = merged.annotate_globals(
                __cols=hl.flatten(merged.__cols.map(lambda x: x.__cols)))

            next_stage.append(
                merged.checkpoint(os.path.join(tmp_dir,
                                               f"stage_{stage}_job_{i}.ht"),
                                  overwrite=True))
        info(f"done stage {stage}")
        stage += 1
        staging.clear()
        staging.extend(next_stage)

    return (staging[0]._unlocalize_entries(
        "__entries", "__cols", list(mts[0].col_key)).unfilter_entries())
Example #30
    def from_random_effects(cls, y, x, z,
                            p_path=None,
                            overwrite=False,
                            max_condition_number=1e-10,
                            complexity_bound=8192):
        r"""Initializes a model from :math:`y`, :math:`X`, and :math:`Z`.

        Examples
        --------
        >>> from hail.stats import LinearMixedModel
        >>> y = np.array([0.0, 1.0, 8.0, 9.0])
        >>> x = np.array([[1.0, 0.0],
        ...               [1.0, 2.0],
        ...               [1.0, 1.0],
        ...               [1.0, 4.0]])
        >>> z = np.array([[0.0, 0.0, 1.0],
        ...               [0.0, 1.0, 2.0],
        ...               [1.0, 2.0, 4.0],
        ...               [2.0, 4.0, 8.0]])
        >>> model, p = LinearMixedModel.from_random_effects(y, x, z)
        >>> model.fit()
        >>> model.h_sq
        0.38205307244271675

        Notes
        -----
        If :math:`n \leq m`, the returned model is full rank.

        If :math:`n > m`, the returned model is low rank. In this case only,
        eigenvalues less than or equal to `max_condition_number` times the top
        eigenvalue are dropped from :math:`S`, with the corresponding
        eigenvectors dropped from :math:`P`. This guards against precision
        loss on left eigenvectors computed via the right gramian :math:`Z^T Z`
        in :meth:`BlockMatrix.svd`.

        In either case, one can truncate to a rank :math:`r` model as follows.
        If `p` is an ndarray:

        >>> p_r = p[:r, :]     # doctest: +SKIP
        >>> s_r = model.s[:r]  # doctest: +SKIP
        >>> model_r = LinearMixedModel(p_r @ y, p_r @ x, s_r, y, x)  # doctest: +SKIP

        If `p` is a block matrix:

        >>> p[:r, :].write(p_r_path)          # doctest: +SKIP
        >>> p_r = BlockMatrix.read(p_r_path)  # doctest: +SKIP
        >>> s_r = model.s[:r]                 # doctest: +SKIP
        >>> model_r = LinearMixedModel(p_r @ y, p_r @ x, s_r, y, x, p_r_path)  # doctest: +SKIP

        This method applies no standardization to `z`.

        Warning
        -------
        If `z` is a block matrix, then ideally `z` should be the result of
        directly reading from disk (and possibly a transpose). This is most
        critical if :math:`n > m`, because in this case multiplication by `z`
        will result in all preceding transformations being repeated
        ``n / block_size`` times, as explained in :class:`.BlockMatrix`.

        At least one dimension must be less than or equal to 46300.
        See the warning in :meth:`.BlockMatrix.svd` for performance
        considerations.

        Parameters
        ----------
        y: :class:`ndarray`
            :math:`n` vector of observations :math:`y`.
        x: :class:`ndarray`
            :math:`n \times p` matrix of fixed effects :math:`X`.
        z: :class:`ndarray` or :class:`BlockMatrix`
            :math:`n \times m` matrix of random effects :math:`Z`.
        p_path: :obj:`str`, optional
            Path at which to write :math:`P` as a block matrix.
            Required if `z` is a block matrix.
        overwrite: :obj:`bool`
            If ``True``, overwrite an existing file at `p_path`.
        max_condition_number: :obj:`float`
            Maximum condition number. Must be greater than 1e-16.
        complexity_bound: :obj:`int`
            Complexity bound for :meth:`.BlockMatrix.svd` when `z` is a block
            matrix.

        Returns
        -------
        model: :class:`LinearMixedModel`
            Model constructed from :math:`y`, :math:`X`, and :math:`Z`.
        p: :class:`ndarray` or :class:`.BlockMatrix`
            Matrix :math:`P` whose rows are the eigenvectors of :math:`K`.
            The type is block matrix if `z` is a block matrix and
            :meth:`.BlockMatrix.svd` of `z` returns :math:`U` as a block matrix.
        """
        z_is_bm = isinstance(z, BlockMatrix)

        if z_is_bm and p_path is None:
            raise ValueError("from_random_effects: 'p_path' required when 'z'"
                             "is a block matrix.")

        if max_condition_number < 1e-16:
            raise ValueError("from_random_effects: 'max_condition_number' must "
                             f"be at least 1e-16, found {max_condition_number}")

        _check_dims(y, "y", 1)
        _check_dims(x, "x", 2)
        _check_dims(z, "z", 2)

        n, m = z.shape

        if y.shape[0] != n:
            raise ValueError("from_random_effects: 'y' and 'z' must have the "
                             "same number of rows")
        if x.shape[0] != n:
            raise ValueError("from_random_effects: 'x' and 'z' must have the "
                             "same number of rows")

        if z_is_bm:
            u, s0, _ = z.svd(complexity_bound=complexity_bound)
            p = u.T
            p_is_bm = isinstance(p, BlockMatrix)
        else:
            u, s0, _ = hl.linalg._svd(z, full_matrices=False)
            p = u.T
            p_is_bm = False

        s = s0 ** 2

        low_rank = n > m

        if low_rank:
            assert np.all(np.isfinite(s))
            r = np.searchsorted(-s, -max_condition_number * s[0])
            if r < m:
                info(f'from_random_effects: model rank reduced from {m} to {r} '
                     f'due to ill-conditioning.'
                     f'\n    Largest dropped eigenvalue was {s[r]}.')
            s = s[:r]
            p = p[:r, :]

        if p_path is not None:
            if p_is_bm:
                p.write(p_path, overwrite=overwrite)
                p = BlockMatrix.read(p_path)
            else:
                BlockMatrix.from_numpy(p).write(p_path, overwrite=overwrite)
        if p_is_bm:
            py, px = (p @ y).to_numpy(), (p @ x).to_numpy()
        else:
            py, px = p @ y, p @ x

        if low_rank:
            model = LinearMixedModel(py, px, s, y, x, p_path)
        else:
            model = LinearMixedModel(py, px, s, p_path=p_path)

        return model, p
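The ndarray branch of this constructor can be mirrored in a few lines of NumPy. The sketch below uses made-up dimensions and is not the BlockMatrix path; it just shows why the rows of p diagonalize K = Z Z^T and how y and X are rotated into that eigenbasis.

import numpy as np

rng = np.random.default_rng(0)
n, m = 6, 4                                   # n > m, so this corresponds to the low-rank case
y = rng.standard_normal(n)
x = np.column_stack([np.ones(n), rng.standard_normal(n)])
z = rng.standard_normal((n, m))

u, s0, _ = np.linalg.svd(z, full_matrices=False)
p = u.T                                       # rows are eigenvectors of K = z @ z.T
s = s0 ** 2                                   # eigenvalues of K
py, px = p @ y, p @ x                         # rotated response and covariates

assert np.allclose(p @ (z @ z.T) @ p.T, np.diag(s))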
Example #31
File: qc.py Project: troels/hail
def concordance(
        left,
        right,
        *,
        _localize_global_statistics=True
) -> Tuple[List[List[int]], Table, Table]:
    """Calculate call concordance with another dataset.

    .. include:: ../_templates/req_tstring.rst

    .. include:: ../_templates/req_tvariant.rst

    .. include:: ../_templates/req_biallelic.rst

    .. include:: ../_templates/req_unphased_diploid_gt.rst

    Examples
    --------

    Compute concordance between two datasets and output the global concordance
    statistics and two tables with concordance computed per column key and per
    row key:

    >>> global_conc, cols_conc, rows_conc = hl.concordance(dataset, dataset2)

    Notes
    -----

    This method computes the genotype call concordance (from the entry
    field **GT**) between two biallelic variant datasets.  It requires
    unique sample IDs and performs an inner join on samples (only
    samples in both datasets will be considered). In addition, all genotype
    calls must be **diploid** and **unphased**.

    It performs an ordered zip join of the variants.  That means the
    variants of each dataset are sorted, with duplicate variants
    appearing in some random relative order, and then zipped together.
    When a variant appears a different number of times between the two
    datasets, the dataset with the fewer number of instances is padded
    with "no data".  For example, if a variant is only in one dataset,
    then each genotype is treated as "no data" in the other.

    This method returns a tuple of three objects: a nested list of
    list of int with global concordance summary statistics, a table
    with concordance statistics per column key, and a table with
    concordance statistics per row key.

    **Using the global summary result**

    The global summary is a list of list of int (conceptually a 5 by 5 matrix),
    where the indices have special meaning:

    0. No Data (missing variant)
    1. No Call (missing genotype call)
    2. Hom Ref
    3. Heterozygous
    4. Hom Var

    The first index is the state in the left dataset and the second index is
    the state in the right dataset. Typical uses of the summary list are shown
    below.

    >>> summary, samples, variants = hl.concordance(dataset, dataset2)
    >>> left_homref_right_homvar = summary[2][4]
    >>> left_het_right_missing = summary[3][1]
    >>> left_het_right_something_else = sum(summary[3][:]) - summary[3][3]
    >>> total_concordant = summary[2][2] + summary[3][3] + summary[4][4]
    >>> total_discordant = sum([sum(s[2:]) for s in summary[2:]]) - total_concordant

    **Using the table results**

    Table 1: Concordance statistics by column

    This table contains the column key field of `left`, and the following fields:

        - `n_discordant` (:py:data:`.tint64`) -- Count of discordant calls (see below for
          full definition).
        - `concordance` (:class:`.tarray` of :class:`.tarray` of :py:data:`.tint64`) --
          Array of concordance per state on left and right, matching the structure of
          the global summary defined above.

    Table 2: Concordance statistics by row

    This table contains the row key fields of `left`, and the following fields:

        - `n_discordant` (:py:data:`.tfloat64`) -- Count of discordant calls (see below for
          full definition).
        - `concordance` (:class:`.tarray` of :class:`.tarray` of :py:data:`.tint64`) --
          Array of concordance per state on left and right, matching the structure of the
          global summary defined above.

    In these tables, the column **n_discordant** is provided as a convenience,
    because this is often one of the most useful concordance statistics. This
    value is the number of genotypes which were called (homozygous reference,
    heterozygous, or homozygous variant) in both datasets, but where the call
    did not match between the two.

    The column `concordance` matches the structure of the global summary,
    which is detailed above. Once again, the first index into this array is the
    state on the left, and the second index is the state on the right. For
    example, ``concordance[1][4]`` is the number of "no call" genotypes on the
    left that were called homozygous variant on the right.

    Parameters
    ----------
    left : :class:`.MatrixTable`
        First dataset to compare.
    right : :class:`.MatrixTable`
        Second dataset to compare.

    Returns
    -------
    (list of list of int, :class:`.Table`, :class:`.Table`)
        The global concordance statistics, a table with concordance statistics
        per column key, and a table with concordance statistics per row key.

    """

    require_col_key_str(left, 'concordance, left')
    require_col_key_str(right, 'concordance, right')

    left_sample_counter = left.aggregate_cols(hl.agg.counter(left.col_key[0]))
    right_sample_counter = right.aggregate_cols(
        hl.agg.counter(right.col_key[0]))

    left_bad = [f'{k!r}: {v}' for k, v in left_sample_counter.items() if v > 1]
    right_bad = [
        f'{k!r}: {v}' for k, v in right_sample_counter.items() if v > 1
    ]
    if left_bad or right_bad:
        raise ValueError(f"Found duplicate sample IDs:\n"
                         f"  left:  {', '.join(left_bad)}\n"
                         f"  right: {', '.join(right_bad)}")

    included = set(left_sample_counter.keys()).intersection(
        set(right_sample_counter.keys()))

    info(
        f"concordance: including {len(included)} shared samples "
        f"({len(left_sample_counter)} total on left, {len(right_sample_counter)} total on right)"
    )

    left = require_biallelic(left, 'concordance, left')
    right = require_biallelic(right, 'concordance, right')

    lit = hl.literal(included, dtype=hl.tset(hl.tstr))
    left = left.filter_cols(lit.contains(left.col_key[0]))
    right = right.filter_cols(lit.contains(right.col_key[0]))

    left = left.select_entries('GT').select_rows().select_cols()
    right = right.select_entries('GT').select_rows().select_cols()

    joined = hl.experimental.full_outer_join_mt(left, right)

    def get_idx(struct):
        return hl.cond(hl.is_missing(struct), 0,
                       hl.coalesce(2 + struct.GT.n_alt_alleles(), 1))

    aggr = hl.agg.counter(
        get_idx(joined.left_entry) + 5 * get_idx(joined.right_entry))

    def concordance_array(counter):
        return hl.range(0, 5).map(
            lambda i: hl.range(0, 5).map(lambda j: counter.get(i + 5 * j, 0)))

    def n_discordant(counter):
        return hl.sum(
            hl.array(counter).filter(lambda tup: ~hl.literal(
                {i**2
                 for i in range(5)}).contains(tup[0])).map(lambda tup: tup[1]))

    glob = joined.aggregate_entries(concordance_array(aggr),
                                    _localize=_localize_global_statistics)
    if _localize_global_statistics:
        total_conc = [x[1:] for x in glob[1:]]
        on_diag = sum(total_conc[i][i] for i in range(len(total_conc)))
        total_obs = sum(sum(x) for x in total_conc)
        info(f"concordance: total concordance {on_diag/total_obs * 100:.2f}%")

    per_variant = joined.annotate_rows(concordance=aggr)
    per_variant = per_variant.annotate_rows(
        concordance=concordance_array(per_variant.concordance),
        n_discordant=n_discordant(per_variant.concordance))
    per_sample = joined.annotate_cols(concordance=aggr)
    per_sample = per_sample.annotate_cols(
        concordance=concordance_array(per_sample.concordance),
        n_discordant=n_discordant(per_sample.concordance))

    return glob, per_sample.cols(), per_variant.rows()
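The "total concordance" percentage logged above drops the "no data" row and column (index 0) from the 5x5 summary and compares the diagonal to the total. A small sketch with made-up counts:

summary = [[0] * 5 for _ in range(5)]
summary[2][2], summary[3][3], summary[4][4] = 900, 80, 15   # concordant called genotypes
summary[2][3], summary[3][4] = 4, 1                         # discordant called genotypes

sub = [row[1:] for row in summary[1:]]                      # drop the "no data" state
on_diag = sum(sub[i][i] for i in range(len(sub)))
total = sum(sum(row) for row in sub)
print(f'total concordance {on_diag / total * 100:.2f}%')    # total concordance 99.50%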
Example #32
File: pca.py Project: MariusDanner/hail
def _blanczos_pca(entry_expr,
                  k=10,
                  compute_loadings=False,
                  q_iterations=2,
                  oversampling_param=2,
                  block_size=128):
    r"""Run randomized principal component analysis approximation (PCA)
    on numeric columns derived from a matrix table.

    Implements the Blanczos algorithm found by Rokhlin, Szlam, and Tygert.

    Examples
    --------

    For a matrix table with variant rows, sample columns, and genotype entries,
    compute the top 2 PC sample scores and eigenvalues of the matrix of 0s and
    1s encoding missingness of genotype calls.

    >>> eigenvalues, scores, _ = hl._blanczos_pca(hl.int(hl.is_defined(dataset.GT)),
    ...                                 k=2)

    Warning
    -------
      This method does **not** automatically mean-center or normalize each column.
      If desired, such transformations should be incorporated in `entry_expr`.

      Hail will return an error if `entry_expr` evaluates to missing, nan, or
      infinity on any entry.

    Notes
    -----

    PCA is run on the columns of the numeric matrix obtained by evaluating
    `entry_expr` on each entry of the matrix table, or equivalently on the rows
    of the **transposed** numeric matrix :math:`M` referenced below.

    PCA computes the SVD

    .. math::

      M = USV^T

    where columns of :math:`U` are left singular vectors (orthonormal in
    :math:`\mathbb{R}^n`), columns of :math:`V` are right singular vectors
    (orthonormal in :math:`\mathbb{R}^m`), and :math:`S=\mathrm{diag}(s_1, s_2,
    \ldots)` with ordered singular values :math:`s_1 \ge s_2 \ge \cdots \ge 0`.
    Typically one computes only the first :math:`k` singular vectors and values,
    yielding the best rank :math:`k` approximation :math:`U_k S_k V_k^T` of
    :math:`M`; the truncations :math:`U_k`, :math:`S_k` and :math:`V_k` are
    :math:`n \times k`, :math:`k \times k` and :math:`m \times k`
    respectively.

    From the perspective of the rows of :math:`M` as samples (data points),
    :math:`V_k` contains the loadings for the first :math:`k` PCs while
    :math:`MV_k = U_k S_k` contains the first :math:`k` PC scores of each
    sample. The loadings represent a new basis of features while the scores
    represent the projected data on those features. The eigenvalues of the Gramian
    :math:`MM^T` are the squares of the singular values :math:`s_1^2, s_2^2,
    \ldots`, which represent the variances carried by the respective PCs. Hail
    only computes the loadings if `compute_loadings` is ``True``.

    Scores are stored in a :class:`.Table` with the column key of the matrix
    table as key and a field `scores` of type ``array<float64>`` containing
    the principal component scores.

    Loadings are stored in a :class:`.Table` with the row key of the matrix
    table as key and a field `loadings` of type ``array<float64>`` containing
    the principal component loadings.

    The eigenvalues are returned in descending order, with scores and loadings
    given the corresponding array order.

    Parameters
    ----------
    entry_expr : :class:`.Expression`
        Numeric expression for matrix entries.
    k : :obj:`int`
        Number of principal components.
    compute_loadings : :obj:`bool`
        If ``True``, compute row loadings.
    q_iterations : :obj:`int`
        Number of rounds of power iteration to amplify singular values.
    oversampling_param : :obj:`int`
        Amount of oversampling to use when approximating the singular values.
        Usually a value satisfying ``0 <= oversampling_param <= k``.

    Returns
    -------
    (:obj:`list` of :obj:`float`, :class:`.Table`, :class:`.Table`)
        List of eigenvalues, table with column scores, table with row loadings.
    """
    check_entry_indexed('mt_to_table_of_ndarray/entry_expr', entry_expr)
    mt = matrix_table_source('pca/entry_expr', entry_expr)

    A, ht = mt_to_table_of_ndarray(entry_expr,
                                   block_size,
                                   return_checkpointed_table_also=True)
    A = A.persist()

    # Set Parameters

    q = q_iterations
    L = k + oversampling_param
    n = A.take(1)[0].ndarray.shape[1]

    # Generate random matrix G
    G = hl.nd.zeros((n, L)).map(lambda n: hl.rand_norm(0, 1))

    def hailBlanczos(A, G, k, q):

        h_list = []
        G_i = hl.nd.qr(G)[0]

        for j in range(0, q):
            info(f"blanczos_pca: Beginning iteration {j + 1}/{q+1}")
            temp = A.annotate(H_i=A.ndarray @ G_i)
            temp = temp.annotate(G_i_intermediate=temp.ndarray.T @ temp.H_i)
            result = temp.aggregate(hl.struct(
                Hi_chunks=hl.agg.collect(temp.H_i),
                G_i=hl.agg.ndarray_sum(temp.G_i_intermediate)),
                                    _localize=False)._persist()
            localized_H_i = hl.nd.vstack(result.Hi_chunks)
            h_list.append(localized_H_i)
            G_i = hl.nd.qr(result.G_i)[0]

        info(f"blanczos_pca: Beginning iteration {q+ 1}/{q+1}")
        temp = A.annotate(H_i=A.ndarray @ G_i)
        result = temp.aggregate(hl.agg.collect(temp.H_i),
                                _localize=False)._persist()
        info("blanczos_pca: Iterations complete. Computing local QR")
        localized_H_i = hl.nd.vstack(result)
        h_list.append(localized_H_i)
        H = hl.nd.hstack(h_list)
        Q = hl.nd.qr(H)[0]._persist()
        A = A.annotate(part_size=A.ndarray.shape[0])
        A = A.annotate(rows_preceeding=hl.int32(hl.scan.sum(A.part_size)))
        A = A.annotate_globals(Qt=Q.T)
        T = A.annotate(ndarray=A.Qt[:, A.rows_preceeding:A.rows_preceeding +
                                    A.part_size] @ A.ndarray)
        arr_T = T.aggregate(hl.agg.ndarray_sum(T.ndarray), _localize=False)

        info("blanczos_pca: QR Complete. Computing local SVD")
        U, S, W = hl.nd.svd(arr_T, full_matrices=False)._persist()

        V = Q @ U

        truncV = V[:, :k]
        truncS = S[:k]
        truncW = W[:k, :]

        return truncV, truncS, truncW

    U, S, V = hailBlanczos(A, G, k, q)

    scores = V.transpose() * S
    eigens = hl.eval(S * S)
    info("blanczos_pca: SVD Complete. Computing conversion to PCs.")

    hail_array_scores = scores._data_array()
    cols_and_scores = hl.zip(
        A.index_globals().cols,
        hail_array_scores).map(lambda tup: tup[0].annotate(scores=tup[1]))
    st = hl.Table.parallelize(cols_and_scores, key=list(mt.col_key))

    lt = ht.select()
    lt = lt.annotate_globals(U=U)
    idx_name = '_tmp_pca_loading_index'
    lt = lt.add_index(idx_name)
    lt = lt.annotate(
        loadings=lt.U[lt[idx_name], :]._data_array()).select_globals()
    lt = lt.drop(lt[idx_name])

    if compute_loadings:
        return eigens, st, lt
    else:
        return eigens, st, None
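The relationships spelled out in the Notes (scores are U_k S_k, equivalently M V_k, and eigenvalues are squared singular values) can be checked directly with NumPy on a small dense matrix. This is a sketch with made-up data, not the distributed computation above.

import numpy as np

rng = np.random.default_rng(0)
M = rng.standard_normal((100, 20))          # 100 samples (rows) x 20 features
k = 2

U, S, Vt = np.linalg.svd(M, full_matrices=False)
scores = U[:, :k] * S[:k]                   # per-sample PC scores, U_k S_k
loadings = Vt[:k].T                         # per-feature loadings, the columns of V_k
eigenvalues = S[:k] ** 2

assert np.allclose(scores, M @ loadings)    # M V_k = U_k S_k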
Example #33
    def from_merged_representation(mt,
                                   *,
                                   ref_block_fields=(),
                                   infer_ref_block_fields: bool = True):
        """Create a VariantDataset from a sparse MatrixTable containing variant and reference data."""

        if 'END' not in mt.entry:
            raise ValueError(
                "VariantDataset.from_merged_representation: expect field 'END' in matrix table entry"
            )

        if 'LA' not in mt.entry:
            raise ValueError(
                "VariantDataset.from_merged_representation: expect field 'LA' in matrix table entry"
            )

        if 'GT' not in mt.entry and 'LGT' not in mt.entry:
            raise ValueError(
                "VariantDataset.from_merged_representation: expect field 'LGT' or 'GT' in matrix table entry"
            )

        n_rows_to_use = 100
        info(
            f"inferring reference block fields from missingness patterns in first {n_rows_to_use} rows"
        )
        used_ref_block_fields = set(ref_block_fields)
        used_ref_block_fields.add('END')

        if infer_ref_block_fields:
            mt_head = mt.head(n_rows=n_rows_to_use)
            for k, any_present in zip(
                    list(mt_head.entry),
                    mt_head.aggregate_entries(
                        hl.agg.filter(
                            hl.is_defined(mt_head.END),
                            tuple(
                                hl.agg.any(hl.is_defined(mt_head[x]))
                                for x in mt_head.entry)))):
                if any_present:
                    used_ref_block_fields.add(k)

        gt_field = 'LGT' if 'LGT' in mt.entry else 'GT'

        # remove LGT/GT and LA fields, which are trivial for reference blocks and do not need to be represented
        if gt_field in used_ref_block_fields:
            used_ref_block_fields.remove(gt_field)
        if 'LA' in used_ref_block_fields:
            used_ref_block_fields.remove('LA')

        info("Including the following fields in reference block table:" +
             "".join(f"\n  {k!r}"
                     for k in mt.entry if k in used_ref_block_fields))

        rmt = mt.filter_entries(hl.case(
        ).when(hl.is_missing(mt.END), False).when(
            hl.is_defined(mt.END) & mt[gt_field].is_hom_ref(), True).or_error(
                hl.str('cannot create VDS from merged representation -'
                       ' found END field with non-reference genotype at ') +
                hl.str(mt.locus) + hl.str(' / ') + hl.str(mt.col_key[0])))
        rmt = rmt.select_entries(*(x for x in rmt.entry
                                   if x in used_ref_block_fields))
        rmt = rmt.filter_rows(hl.agg.count() > 0)

        # drop other alleles
        rmt = rmt.key_rows_by(rmt.locus)
        rmt = rmt.select_rows(ref_allele=rmt.alleles[0][0])

        vmt = mt.filter_entries(hl.is_missing(mt.END))
        vmt = vmt.filter_rows(hl.agg.count() > 0)

        return VariantDataset(rmt, vmt)
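The entry-level split performed by the two filters above can be restated as a tiny classification rule; this plain-Python sketch (hypothetical `classify_entry` helper) mirrors the `hl.case()` logic.

def classify_entry(end, gt_is_hom_ref):
    # END missing -> variant data; END present with a hom-ref call -> reference block;
    # END present with any other call is an error, as in the or_error branch above.
    if end is None:
        return 'variant'
    if gt_is_hom_ref:
        return 'reference'
    raise ValueError('END field present with non-reference genotype')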
Example #34
def run_combiner(sample_paths: List[str],
                 out_file: str,
                 tmp_path: str,
                 intervals: Optional[List[hl.utils.Interval]] = None,
                 header: Optional[str] = None,
                 sample_names: Optional[List[str]] = None,
                 branch_factor: int = CombinerConfig.default_branch_factor,
                 batch_size: int = CombinerConfig.default_batch_size,
                 target_records: int = CombinerConfig.default_target_records,
                 overwrite: bool = False,
                 reference_genome: str = 'default',
                 contig_recoding: Optional[Dict[str, str]] = None,
                 key_by_locus_and_alleles: bool = False):
    """Run the Hail VCF combiner, performing a hierarchical merge to create a combined sparse matrix table.

    Parameters
    ----------
    sample_paths : :obj:`list` of :obj:`str`
        Paths to individual GVCFs.
    out_file : :obj:`str`
        Path to final combined matrix table.
    tmp_path : :obj:`str`
        Path for intermediate output.
    intervals : list of :class:`.Interval` or None
        Partitioning with which to import GVCFs in first phase of combiner.
    header : :obj:`str` or None
        External header file to use as GVCF header for all inputs. If defined, `sample_names` must be defined as well.
    sample_names: list of :obj:`str` or None
        Sample names, to be used with `header`.
    branch_factor : :obj:`int`
        Combiner branch factor.
    batch_size : :obj:`int`
        Combiner batch size.
    target_records : :obj:`int`
        Target records per partition in each combiner phase after the first.
    overwrite : :obj:`bool`
        Overwrite output file, if it exists.
    reference_genome : :obj:`str`
        Reference genome for GVCF import.
    contig_recoding: :obj:`dict` of (:obj:`str`, :obj:`str`), optional
        Mapping from contig name in gVCFs to contig name in the reference
        genome.  All contigs must be present in the
        `reference_genome`, so this is useful for mapping
        differently-formatted data onto known references.
    key_by_locus_and_alleles : :obj:`bool`
        Key by both locus and alleles in the final output.

    Returns
    -------
    None

    """
    tmp_path += f'/combiner-temporary/{uuid.uuid4()}/'
    if header is not None:
        assert sample_names is not None
        assert len(sample_names) == len(sample_paths)

    # FIXME: this should be hl.default_reference().even_intervals_contig_boundary
    intervals = intervals or default_exome_intervals(reference_genome)

    config = CombinerConfig(branch_factor=branch_factor,
                            batch_size=batch_size,
                            target_records=target_records)
    plan = config.plan(len(sample_paths))

    files_to_merge = sample_paths
    n_phases = len(plan.phases)
    total_ops = len(files_to_merge) * n_phases
    total_work_done = 0
    for phase_i, phase in enumerate(plan.phases):
        phase_i += 1  # used for info messages, 1-indexed for readability

        n_jobs = len(phase.jobs)
        merge_str = 'input GVCFs' if phase_i == 1 else 'intermediate sparse matrix tables'
        job_str = hl.utils.misc.plural('job', n_jobs)
        info(
            f"Starting phase {phase_i}/{n_phases}, merging {len(files_to_merge)} {merge_str} in {n_jobs} {job_str}."
        )

        if phase_i > 1:
            intervals = calculate_new_intervals(
                hl.read_matrix_table(files_to_merge[0]).rows(),
                config.target_records,
                reference_genome=reference_genome)

        new_files_to_merge = []

        for job_i, job in enumerate(phase.jobs):
            job_i += 1  # used for info messages, 1-indexed for readability

            n_merges = len(job.merges)
            merge_str = hl.utils.misc.plural('file', n_merges)
            pct_total = 100 * job.input_total_size / total_ops
            info(
                f"Starting phase {phase_i}/{n_phases}, job {job_i}/{len(phase.jobs)} to create {n_merges} merged {merge_str}, corresponding to ~{pct_total:.1f}% of total I/O."
            )
            merge_mts: List[MatrixTable] = []
            for merge in job.merges:
                inputs = [files_to_merge[i] for i in merge.inputs]

                if phase_i == 1:
                    mts = [
                        transform_gvcf(vcf) for vcf in hl.import_gvcfs(
                            inputs,
                            intervals,
                            array_elements_required=False,
                            _external_header=header,
                            _external_sample_ids=[
                                sample_names[i] for i in merge.inputs
                            ] if header is not None else None,
                            reference_genome=reference_genome,
                            contig_recoding=contig_recoding)
                    ]
                else:
                    mts = [
                        hl.read_matrix_table(path, _intervals=intervals)
                        for path in inputs
                    ]

                merge_mts.append(combine_gvcfs(mts))

            if phase_i == n_phases:  # final merge!
                assert n_jobs == 1
                assert len(merge_mts) == 1
                [final_mt] = merge_mts

                if key_by_locus_and_alleles:
                    final_mt = MatrixTable(
                        MatrixKeyRowsBy(final_mt._mir, ['locus', 'alleles'],
                                        is_sorted=True))
                final_mt.write(out_file, overwrite=overwrite)
                new_files_to_merge = [out_file]
                info(
                    f"Finished phase {phase_i}/{n_phases}, job {job_i}/{len(phase.jobs)}, 100% of total I/O finished."
                )
                break

            tmp = f'{tmp_path}_phase{phase_i}_job{job_i}/'
            hl.experimental.write_matrix_tables(merge_mts, tmp, overwrite=True)
            pad = len(str(len(merge_mts)))
            new_files_to_merge.extend(tmp + str(n).zfill(pad) + '.mt'
                                      for n in range(len(merge_mts)))
            total_work_done += job.input_total_size
            info(
                f"Finished {phase_i}/{n_phases}, job {job_i}/{len(phase.jobs)}, {100 * total_work_done / total_ops:.1f}% of total I/O finished."
            )

        info(f"Finished phase {phase_i}/{n_phases}.")

        files_to_merge = new_files_to_merge

    assert files_to_merge == [out_file]

    info("Finished!")
Example #35
0
File: import_gtf.py  Project: jigold/hail
def get_gene_intervals(gene_symbols=None, gene_ids=None, transcript_ids=None,
                       verbose=True, reference_genome=None, gtf_file=None):
    """Get intervals of genes or transcripts.

    Get the boundaries of genes or transcripts from a GTF file, for quick filtering of a Table or MatrixTable.

    On Google Cloud platform:
    Gencode v19 (GRCh37) GTF available at: gs://hail-common/references/gencode/gencode.v19.annotation.gtf.bgz
    Gencode v29 (GRCh38) GTF available at: gs://hail-common/references/gencode/gencode.v29.annotation.gtf.bgz

    Example
    -------
    >>> hl.filter_intervals(ht, get_gene_intervals(gene_symbols=['PCSK9'], reference_genome='GRCh37'))  # doctest: +SKIP

    Parameters
    ----------

    gene_symbols : :obj:`list` of :obj:`str`, optional
       Gene symbols (e.g. PCSK9).
    gene_ids : :obj:`list` of :obj:`str`, optional
       Gene IDs (e.g. ENSG00000223972).
    transcript_ids : :obj:`list` of :obj:`str`, optional
       Transcript IDs (e.g. ENST00000456328).
    verbose : :obj:`bool`
       If ``True``, print which genes and transcripts were matched in the GTF file.
    reference_genome : :obj:`str` or :class:`.ReferenceGenome`, optional
       Reference genome to use (passed along to import_gtf).
    gtf_file : :obj:`str`
       GTF file to load. If none is provided, but `reference_genome` is one of
       `GRCh37` or `GRCh38`, a default will be used (on Google Cloud Platform).

    Returns
    -------
    :obj:`list` of :class:`.Interval`
    """
    GTFS = {
        'GRCh37': 'gs://hail-common/references/gencode/gencode.v19.annotation.gtf.bgz',
        'GRCh38': 'gs://hail-common/references/gencode/gencode.v29.annotation.gtf.bgz',
    }
    if reference_genome is None:
        reference_genome = hl.default_reference().name
    if gtf_file is None:
        gtf_file = GTFS.get(reference_genome)
        if gtf_file is None:
            raise ValueError('get_gene_intervals requires a GTF file, or the reference genome be one of GRCh37 or GRCh38 (when on Google Cloud Platform)')
    if gene_symbols is None and gene_ids is None and transcript_ids is None:
        raise ValueError('get_gene_intervals requires at least one of gene_symbols, gene_ids, or transcript_ids')
    ht = hl.experimental.import_gtf(gtf_file, reference_genome=reference_genome,
                                    skip_invalid_contigs=True, min_partitions=12)
    ht = ht.annotate(gene_id=ht.gene_id.split('\\.')[0],
                     transcript_id=ht.transcript_id.split('\\.')[0])
    criteria = []
    if gene_symbols:
        criteria.append(hl.any(lambda y: (ht.feature == 'gene') & (ht.gene_name == y), gene_symbols))
    if gene_ids:
        criteria.append(hl.any(lambda y: (ht.feature == 'gene') & (ht.gene_id == y.split('\\.')[0]), gene_ids))
    if transcript_ids:
        criteria.append(hl.any(lambda y: (ht.feature == 'transcript') & (ht.transcript_id == y.split('\\.')[0]), transcript_ids))

    ht = ht.filter(functools.reduce(operator.ior, criteria))
    gene_info = ht.aggregate(hl.agg.collect((ht.feature, ht.gene_name, ht.gene_id, ht.transcript_id, ht.interval)))
    if verbose:
        info(f'get_gene_intervals found {len(gene_info)} entries:\n' +
             "\n".join(map(lambda x: f'{x[0]}: {x[1]} ({x[2] if x[0] == "gene" else x[3]})', gene_info)))
    intervals = list(map(lambda x: x[-1], gene_info))
    return intervals
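# --- Hypothetical usage sketch (not from the original source) ---
# Filters a locus-keyed Table 'ht' (assumed to exist) to a single gene using
# the Ensembl gene ID already cited in the docstring above; the default GTF is
# resolved from reference_genome as documented.
intervals = get_gene_intervals(gene_ids=['ENSG00000223972'],
                               reference_genome='GRCh37')
filtered_ht = hl.filter_intervals(ht, intervals)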
Example #36
0
    def from_random_effects(cls, y, x, z,
                            p_path=None,
                            overwrite=False,
                            max_condition_number=1e-10,
                            complexity_bound=8192):
        r"""Initializes a model from :math:`y`, :math:`X`, and :math:`Z`.

        Examples
        --------
        >>> from hail.stats import LinearMixedModel
        >>> y = np.array([0.0, 1.0, 8.0, 9.0])
        >>> x = np.array([[1.0, 0.0],
        ...               [1.0, 2.0],
        ...               [1.0, 1.0],
        ...               [1.0, 4.0]])
        >>> z = np.array([[0.0, 0.0, 1.0],
        ...               [0.0, 1.0, 2.0],
        ...               [1.0, 2.0, 4.0],
        ...               [2.0, 4.0, 8.0]])
        >>> model, p = LinearMixedModel.from_random_effects(y, x, z)
        >>> model.fit()
        >>> model.h_sq
        0.38205307244271675

        Notes
        -----
        If :math:`n \leq m`, the returned model is full rank.

        If :math:`n > m`, the returned model is low rank. In this case only,
        eigenvalues less than or equal to `max_condition_number` times the top
        eigenvalue are dropped from :math:`S`, with the corresponding
        eigenvectors dropped from :math:`P`. This guards against precision
        loss on left eigenvectors computed via the right gramian :math:`Z^T Z`
        in :meth:`BlockMatrix.svd`.

        In either case, one can truncate to a rank :math:`r` model as follows.
        If `p` is an ndarray:

        >>> p_r = p[:r, :]     # doctest: +SKIP
        >>> s_r = model.s[:r]  # doctest: +SKIP
        >>> model_r = LinearMixedModel(p_r @ y, p_r @ x, s_r, y, x)  # doctest: +SKIP

        If `p` is a block matrix:

        >>> p[:r, :].write(p_r_path)          # doctest: +SKIP
        >>> p_r = BlockMatrix.read(p_r_path)  # doctest: +SKIP
        >>> s_r = model.s[:r]                 # doctest: +SKIP
        >>> model_r = LinearMixedModel(p_r @ y, p_r @ x, s_r, y, x, p_r_path)  # doctest: +SKIP

        This method applies no standardization to `z`.

        Warning
        -------
        If `z` is a block matrix, then ideally `z` should be the result of
        directly reading from disk (and possibly a transpose). This is most
        critical if :math:`n > m`, because in this case multiplication by `z`
        will result in all preceding transformations being repeated
        ``n / block_size`` times, as explained in :class:`.BlockMatrix`.

        At least one dimension must be less than or equal to 46300.
        See the warning in :meth:`.BlockMatrix.svd` for performance
        considerations.

        Parameters
        ----------
        y: :class:`ndarray`
            :math:`n` vector of observations :math:`y`.
        x: :class:`ndarray`
            :math:`n \times p` matrix of fixed effects :math:`X`.
        z: :class:`ndarray` or :class:`BlockMatrix`
            :math:`n \times m` matrix of random effects :math:`Z`.
        p_path: :obj:`str`, optional
            Path at which to write :math:`P` as a block matrix.
            Required if `z` is a block matrix.
        overwrite: :obj:`bool`
            If ``True``, overwrite an existing file at `p_path`.
        max_condition_number: :obj:`float`
            Maximum condition number. Must be greater than 1e-16.
        complexity_bound: :obj:`int`
            Complexity bound for :meth:`.BlockMatrix.svd` when `z` is a block
            matrix.

        Returns
        -------
        model: :class:`LinearMixedModel`
            Model constructed from :math:`y`, :math:`X`, and :math:`Z`.
        p: :class:`ndarray` or :class:`.BlockMatrix`
            Matrix :math:`P` whose rows are the eigenvectors of :math:`K`.
            The type is block matrix if `z` is a block matrix and
            :meth:`.BlockMatrix.svd` of `z` returns :math:`U` as a block matrix.
        """
        z_is_bm = isinstance(z, BlockMatrix)

        if z_is_bm and p_path is None:
            raise ValueError("from_random_effects: 'p_path' required when 'z'"
                             "is a block matrix.")

        if max_condition_number < 1e-16:
            raise ValueError("from_random_effects: 'max_condition_number' must "
                             f"be at least 1e-16, found {max_condition_number}")

        _check_dims(y, "y", 1)
        _check_dims(x, "x", 2)
        _check_dims(z, "z", 2)

        n, m = z.shape

        if y.shape[0] != n:
            raise ValueError("from_random_effects: 'y' and 'z' must have the "
                             "same number of rows")
        if x.shape[0] != n:
            raise ValueError("from_random_effects: 'x' and 'z' must have the "
                             "same number of rows")

        if z_is_bm:
            u, s0, _ = z.svd(complexity_bound=complexity_bound)
            p = u.T
            p_is_bm = isinstance(p, BlockMatrix)
        else:
            u, s0, _ = hl.linalg._svd(z, full_matrices=False)
            p = u.T
            p_is_bm = False

        s = s0 ** 2

        low_rank = n > m

        if low_rank:
            assert np.all(np.isfinite(s))
            r = np.searchsorted(-s, -max_condition_number * s[0])
            if r < m:
                info(f'from_random_effects: model rank reduced from {m} to {r} '
                     f'due to ill-conditioning.'
                     f'\n    Largest dropped eigenvalue was {s[r]}.')
            s = s[:r]
            p = p[:r, :]

        if p_path is not None:
            if p_is_bm:
                p.write(p_path, overwrite=overwrite)
                p = BlockMatrix.read(p_path)
            else:
                BlockMatrix.from_numpy(p).write(p_path, overwrite=overwrite)
        if p_is_bm:
            py, px = (p @ y.reshape(n, 1)).to_numpy().flatten(), (p @ x).to_numpy()
        else:
            py, px = p @ y, p @ x

        if low_rank:
            model = LinearMixedModel(py, px, s, y, x, p_path)
        else:
            model = LinearMixedModel(py, px, s, p_path=p_path)

        return model, p
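# --- Hypothetical usage sketch (not from the original source) ---
# Calls from_random_effects with z as a BlockMatrix, in which case p_path is
# required (see the check at the top of the method). The output path is a
# placeholder; y, x, and z reuse the values from the docstring example.
import numpy as np
import hail as hl
from hail.linalg import BlockMatrix
from hail.stats import LinearMixedModel

hl.init()  # BlockMatrix operations assume an initialized Hail session
y = np.array([0.0, 1.0, 8.0, 9.0])
x = np.array([[1.0, 0.0], [1.0, 2.0], [1.0, 1.0], [1.0, 4.0]])
z_bm = BlockMatrix.from_numpy(np.array([[0.0, 0.0, 1.0],
                                        [0.0, 1.0, 2.0],
                                        [1.0, 2.0, 4.0],
                                        [2.0, 4.0, 8.0]]))
model, p = LinearMixedModel.from_random_effects(
    y, x, z_bm, p_path='/tmp/lmm_p.bm', overwrite=True)
model.fit()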
Example #37
0
    def from_mixed_effects(cls, y, x, z, max_condition_number=1e-10):
        r"""Initializes a model from :math:`y`, :math:`X`, and :math:`Z`.

        Examples
        --------
        >>> from hail.stats import LinearMixedModel
        >>> y = np.array([0.0, 1.0, 8.0, 9.0])
        >>> x = np.array([[1.0, 0.0],
        ...               [1.0, 2.0],
        ...               [1.0, 1.0],
        ...               [1.0, 4.0]])
        >>> z = np.array([[0.0, 0.0, 1.0],
        ...               [0.0, 1.0, 2.0],
        ...               [1.0, 2.0, 4.0],
        ...               [2.0, 4.0, 8.0]])
        >>> model, p = LinearMixedModel.from_mixed_effects(y, x, z)
        >>> model.fit()
        >>> model.h_sq
        0.38205307244271675

        Notes
        -----
        If :math:`n \leq m`, the returned model is full rank.

        If :math:`n > m`, the returned model is low rank. In this case only,
        eigenvalues less than or equal to `max_condition_number` times the top
        eigenvalue are dropped from :math:`S`, with the corresponding
        eigenvectors dropped from :math:`P`. This guards against precision
        loss on left eigenvectors computed via the right gramian :math:`Z^T Z`
        in :meth:`BlockMatrix.svd`.

        In either case, one can truncate to a rank :math:`r` model as follows:

        >>> s_r = model.s[:r]  # doctest: +SKIP
        >>> p_r = p[:r, :]  # doctest: +SKIP
        >>> model_r = LinearMixedModel(p_r @ y, p_r @ x, s_r, y, x)  # doctest: +SKIP

        No standardization is applied to `z`.

        Warning
        -------
        If `z` is a block matrix, then ideally `z` should be the result of
        directly reading from disk (and possibly a transpose). This is most
        critical if :math:`n > m`, because in this case multiplication by `z`
        will result in all preceding transformations being repeated
        ``n / block_size`` times, as explained in :class:`BlockMatrix`.

        Parameters
        ----------
        y: :class:`ndarray`
            :math:`n` vector of observations :math:`y`.
        x: :class:`ndarray`
            :math:`n \times p` matrix of fixed effects :math:`X`.
        z: :class:`ndarray` or :class:`BlockMatrix`
            :math:`n \times m` matrix of random effects :math:`Z`.
        max_condition_number: :obj:`float`
            Maximum condition number. Must be greater than 1e-16.

        Returns
        -------
        model: :class:`LinearMixedModel`
            Model constructed from :math:`y`, :math:`X`, and :math:`Z`.
        p: :class:`ndarray`
            Matrix :math:`P` whose rows are the eigenvectors of :math:`K`.
        """
        if max_condition_number < 1e-16:
            raise ValueError(
                "from_random_effects: 'max_condition_number' must "
                f"be at least 1e-16, found {max_condition_number}")

        _check_dims(y, "y", 1)
        _check_dims(x, "x", 2)
        _check_dims(z, "z", 2)

        n, m = z.shape

        if y.shape[0] != n:
            raise ValueError("from_mixed_effects: 'y' and 'z' must have the "
                             "same number of rows")
        if x.shape[0] != n:
            raise ValueError("from_mixed_effects: 'x' and 'z' must have the "
                             "same number of rows")

        if isinstance(z, np.ndarray):
            u, s0, _ = hl.linalg._svd(z, full_matrices=False)
            p = u.T
            py, px = p @ y, p @ x
        else:
            u, s0, _ = z.svd()
            p = u.T
            py, px = (p @ y).to_numpy(), (p @ x).to_numpy()

        s = s0**2

        full_rank = n <= m
        if full_rank:
            model = LinearMixedModel(py, px, s)
        else:
            assert np.all(np.isfinite(s))
            r = np.searchsorted(-s, -max_condition_number * s[0])
            if r < m:
                info(f'from_mixed_effects: model rank reduced from {m} to {r} '
                     f'due to ill-conditioning.'
                     f'\n    Largest dropped eigenvalue was {s[r]}.')
            s = s[:r]
            p = p[:r, :]
            model = LinearMixedModel(py, px, s, y, x)
        return model, p
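# --- Illustrative sketch (not part of the original source) ---
# A small NumPy-only demonstration of the rank-reduction rule shared by both
# constructors above: eigenvalues at or below max_condition_number times the
# top eigenvalue are dropped. The eigenvalues below are made up.
import numpy as np

s = np.array([4.0, 1.0, 1e-12, 1e-15])   # descending squared singular values
max_condition_number = 1e-10
# s is descending, so -s is ascending; searchsorted returns the first index i
# with s[i] <= max_condition_number * s[0], i.e. the reduced rank r.
r = np.searchsorted(-s, -max_condition_number * s[0])
print(r, s[:r])                           # -> 2 [4. 1.]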