def _get_cached_sample_gts(gts, sample, indi_cache):
    """Return ``(genotypes, missing_mask)`` for one sample, using *indi_cache*.

    On a cache hit the stored pair is returned as is and *gts* is not
    touched. On a miss the sample is sliced out with ``gts[:, sample]``,
    its missing mask is computed with ``is_missing`` and the pair is stored
    in *indi_cache* under *sample* before being returned.
    """
    # Explicit membership test (rather than try/except KeyError) so that
    # mapping types that create entries on lookup are not modified on a miss.
    if sample in indi_cache:
        sample_gts, missing_mask = indi_cache[sample]
    else:
        sample_gts = gts[:, sample]
        missing_mask = is_missing(sample_gts)
        indi_cache[sample] = sample_gts, missing_mask
    return sample_gts, missing_mask


def _get_sample_gts(gts, sample_i, sample_j, indi_cache):
    """Return the genotypes of two samples at the positions called in both.

    Args:
        gts: Genotype array; each sample is taken as ``gts[:, sample]``.
            Only read when a sample is not already in *indi_cache*.
        sample_i: Index of the first sample (also its key in *indi_cache*).
        sample_j: Index of the second sample (also its key in *indi_cache*).
        indi_cache: Mutable mapping ``sample -> (genotypes, missing_mask)``.
            It is filled in place for any sample not yet present, so that
            repeated pairwise calls do not recompute the slices and masks.

    Returns:
        A tuple ``(indi1, indi2)`` with the genotypes of both samples,
        restricted to the rows where neither sample is flagged as missing.

    Raises:
        AssertionError: If either filtered array does not have an integer
            dtype.
    """
    indi1, is_missing_1 = _get_cached_sample_gts(gts, sample_i, indi_cache)
    indi2, is_missing_2 = _get_cached_sample_gts(gts, sample_j, indi_cache)

    # A row is kept only when it is missing in neither of the two samples.
    is_called = numpy.logical_not(numpy.logical_or(is_missing_1, is_missing_2))

    indi1 = indi1[is_called]
    indi2 = indi2[is_called]

    # Sanity checks: the callers are expected to work with integer-coded
    # genotypes.
    assert issubclass(indi1.dtype.type, numpy.integer), (
        "genotypes of the first sample must have an integer dtype"
    )
    assert issubclass(indi2.dtype.type, numpy.integer), (
        "genotypes of the second sample must have an integer dtype"
    )

    return indi1, indi2
def _snp_has_enough_data(variations, max_missing_rate_in_ref_snp=0.1):
    """Flag the SNPs whose missing-genotype rate is below a threshold.

    Args:
        variations: Mapping-like object; its ``GT_FIELD`` entry holds the
            genotype array.
        max_missing_rate_in_ref_snp: Exclusive upper bound for the missing
            rate of a SNP.

    Returns:
        Boolean array with one entry per index of the first axis of the
        genotype array: True where the missing rate is strictly lower than
        *max_missing_rate_in_ref_snp*.
    """
    # NOTE(review): presumably the genotypes are laid out as
    # (snps, samples, ploidy) -- confirm against the callers.
    genotypes = variations[GT_FIELD]

    # Collapse the last axis into one missing flag per entry of the first
    # two axes, then count the flagged entries along the second axis.
    missing_calls = is_missing(genotypes, axis=2)
    missing_per_snp = numpy.sum(missing_calls, axis=1)

    entries_per_snp = genotypes.shape[1]
    rate_of_missing = missing_per_snp / entries_per_snp

    return rate_of_missing < max_missing_rate_in_ref_snp