def test_load_top_associations_by_top_threshold(self):
    """Check that a score cut-off and a p-value cut-off select the same hits.

    Calling ``get_top_associations`` with ``5`` and with ``1e-5`` (both in
    ``'threshold'`` mode) is expected to return identical records --
    presumably because scores are -log10(p-value); TODO confirm.
    """
    top_associations, thresholds = hdf5.get_top_associations(
        self.hdf5_file, 5, maf=0, top_or_threshold='threshold')
    # ``np.recarray`` is the public alias of ``np.core.records.recarray``;
    # the ``np.core`` path is deprecated in NumPy 2.
    assert isinstance(top_associations, np.recarray)
    assert len(top_associations) == 14
    for assoc in top_associations:
        assert assoc['score'] >= 5.0

    top_associations_by_e, thresholds = hdf5.get_top_associations(
        self.hdf5_file, 1e-5, maf=0, top_or_threshold='threshold')
    # Check the type of the second result (not the first one again).
    assert isinstance(top_associations_by_e, np.recarray)
    assert len(top_associations) == len(top_associations_by_e)
    for by_score, by_e in zip(top_associations, top_associations_by_e):
        assert by_e.tolist() == by_score.tolist()
def index_study(study_id, perm_threshold=None):
    """Index the top associations of a study in elasticsearch.

    Args:
        study_id: primary key used to look up the ``Study``.
        perm_threshold: optional permutation threshold; when truthy it is
            stored under ``'permutation_threshold'`` in the thresholds that
            are handed to the indexer.

    Returns:
        ``((indexed_assoc, failed_assoc), study_id)`` where the two counts
        come from ``elastic.index_associations``.
    """
    study = Study.objects.get(pk=study_id)
    hdf5_file = os.path.join(
        settings.HDF5_FILE_PATH, 'gwas_results', '%s.hdf5' % study.pk)
    top_associations, thresholds = hdf5.get_top_associations(
        hdf5_file, val=1e-4, top_or_threshold='threshold', maf=0)
    logger.info('Retrieved top associations from GWAS %s' % study_id)
    if perm_threshold:
        thresholds['permutation_threshold'] = perm_threshold
    indexed_assoc, failed_assoc = elastic.index_associations(
        study, top_associations, thresholds)
    if failed_assoc > 0:
        # The format string needs one placeholder per argument; the previous
        # single-placeholder message raised TypeError on this error path.
        logger.error(
            '%s/%s associations failed to index for "%s" in elasticsearch'
            % (failed_assoc, indexed_assoc + failed_assoc, study_id))
    elif indexed_assoc == 0:
        # ``Logger.warn`` is a deprecated alias of ``Logger.warning``.
        logger.warning(
            'No associations found that match the threshold. '
            'Skipping "%s" in elasticsearch' % study_id)
    else:
        logger.info(
            'Successfully indexed all %s assocations for "%s" in elasticsearch.'
            % (indexed_assoc, study_id))
    return (indexed_assoc, failed_assoc), study_id
def assocations_from_hdf5(self, request, pk):
    """Return the associations stored in the study's HDF5 file.

    Query parameters:
        filter_type: ``'threshold'`` (default) to retrieve all associations
            above the threshold, or ``'top'`` to retrieve only the top N
            associations.
        filter: the threshold, or the number of desired associations when
            ``filter_type`` is ``'top'`` (default ``1``).

    The response holds one ``chr1`` .. ``chr5`` entry (each with ``scores``,
    ``positions`` and ``mafs``) plus a ``thresholds`` entry.

    Raises:
        ValueError: if ``filter_type`` is neither ``'threshold'`` nor ``'top'``.
    """
    params = request.query_params
    mode = params.get('filter_type', 'threshold')
    if mode != 'threshold' and mode != 'top':
        raise ValueError('filter_type must be either "threshold" or "top"')

    cutoff = float(params.get('filter', 1))
    if mode == 'top':
        cutoff = int(cutoff)

    hdf5_path = os.path.join(settings.HDF5_FILE_PATH, '%s.hdf5' % pk)
    top_associations, thresholds = get_top_associations(
        hdf5_path, maf=0, val=cutoff, top_or_threshold=mode)

    payload = {}
    start = 0
    for chrom in range(1, 6):
        # Position of the next chromosome label marks the end of this one;
        # searchsorted assumes the 'chr' column is sorted -- TODO confirm.
        stop = top_associations['chr'].searchsorted(str(chrom + 1))
        window = slice(start, stop)
        payload['chr%s' % chrom] = {
            'scores': top_associations['score'][window],
            'positions': top_associations['position'][window],
            'mafs': top_associations['maf'][window],
        }
        start = stop

    # Convert the threshold values to plain Python numbers in place.
    for name, raw in thresholds.items():
        if name == 'total_associations':
            thresholds[name] = int(raw)
        else:
            thresholds[name] = float(raw)
    payload['thresholds'] = thresholds
    return Response(payload, status=status.HTTP_200_OK)
def test_load_top_associations_by_top_hits_and_maf(self):
    """Check retrieval of the top N hits and that none has a MAF below 0.05."""
    hits_per_chr = 15
    expected_first_hits = [
        ('1', 6369772, 5.559458119903501, 0.1386861313868613, 19,
         0.360335870170728, 0.0761941875889666),
        ('2', 18351161, 5.221548337450959, 0.08029197080291971, 11,
         0.328720498341187, 0.0747141063333232),
        ('3', 18057816, 4.795206143400829, 0.2116788321167883, 29,
         -0.336795159960789, 0.0737295910747224),
        ('4', 429928, 6.555416448260276, 0.4233576642335766, 58,
         0.368255762771892, 0.0711756042811744),
        ('5', 18577788, 6.219812361173065, 0.15328467153284672, 21,
         -0.327934944673749, 0.0833854459419328),
    ]
    top_associations, thresholds = hdf5.get_top_associations(
        self.hdf5_file, hits_per_chr, top_or_threshold='top')

    assert thresholds['bonferroni_threshold01'] == 7.294197188903931
    assert thresholds['bonferroni_threshold05'] == 6.5952271845679125
    assert thresholds['bh_threshold'] == 6.6150447667600778
    assert thresholds['total_associations'] == 196878

    assert len(top_associations) == hits_per_chr * 5
    assert not np.any(top_associations['maf'] < 0.05)
    self._check_return_array(top_associations)

    # The first record of each block of ``hits_per_chr`` rows is compared
    # against the expected tuple at the same position.
    for block_idx, expected in enumerate(expected_first_hits):
        first_row = top_associations[block_idx * hits_per_chr]
        assert first_row.tolist() == expected
def test_regroup_top_assocations(self):
    """Check the first and last record returned by ``regroup_associations``.

    The expected score and MAF values are rounded to 8 decimals, so they are
    compared with a small tolerance; chromosome, position and the fifth field
    (presumably the minor allele count -- TODO confirm) are compared exactly.
    """
    top_associations, thresholds = hdf5.get_top_associations(
        self.hdf5_file, 5, maf=0, top_or_threshold='threshold')
    top_associations = hdf5.regroup_associations(top_associations)
    expected_rows = (
        (0, ('4', 429928, 6.55541645, 0.42335766, 58)),
        (-1, ('5', 18606578, 5.07844918, 0.47445255, 65)),
    )
    # The comparisons used to be bare expressions without ``assert``, so the
    # test could never fail; assert each field explicitly instead.
    for row_idx, (chrom, position, score, maf, count) in expected_rows:
        actual = top_associations[row_idx].tolist()
        assert actual[0] == chrom
        assert actual[1] == position
        assert abs(actual[2] - score) < 1e-6
        assert abs(actual[3] - maf) < 1e-6
        assert actual[4] == count
def index_study(study_id, perm_threshold=None):
    """Index the top associations of a study in elasticsearch.

    Args:
        study_id: primary key used to look up the ``Study``.
        perm_threshold: optional permutation threshold; when truthy it is
            stored under ``'permutation_threshold'`` in the thresholds that
            are handed to the indexer.

    Returns:
        Whatever ``elastic.index_associations`` returns for the study.
    """
    # The docstring must be the first statement of the function, otherwise
    # it is a no-op string expression and ``__doc__`` stays ``None``.
    study = Study.objects.get(pk=study_id)
    hdf5_file = os.path.join(settings.HDF5_FILE_PATH, '%s.hdf5' % study.pk)
    top_associations, thresholds = hdf5.get_top_associations(
        hdf5_file, val=1e-4, top_or_threshold='threshold', maf=0)
    if perm_threshold:
        thresholds['permutation_threshold'] = perm_threshold
    return elastic.index_associations(study, top_associations, thresholds)
def test_load_top_associations_by_top_hits_and_maf(self):
    """Test if top associations by number of hits can be retrieved.

    Also checks the returned thresholds and that no returned association
    has a MAF below 0.05.
    """
    # The docstring now precedes this assignment so that it is a real
    # docstring rather than a stray string expression.
    top_hit_num = 15
    top_hits = [
        ('1', 6369772, 5.559458119903501, 0.1386861313868613, 19),
        ('2', 18351161, 5.221548337450959, 0.08029197080291971, 11),
        ('3', 18057816, 4.795206143400829, 0.2116788321167883, 29),
        ('4', 429928, 6.555416448260276, 0.4233576642335766, 58),
        ('5', 18577788, 6.219812361173065, 0.15328467153284672, 21),
    ]
    top_associations, thresholds = hdf5.get_top_associations(
        self.hdf5_file, top_hit_num, top_or_threshold='top')
    assert thresholds['bonferroni_threshold01'] == 7.294197188903931
    assert thresholds['bonferroni_threshold05'] == 6.5952271845679125
    assert thresholds['bh_threshold'] == 6.6150447667600778
    assert thresholds['total_associations'] == 196878
    assert len(top_associations) == top_hit_num * 5
    assert np.count_nonzero(top_associations['maf'] < 0.05) == 0
    self._check_return_array(top_associations)
    # The first record of each block of ``top_hit_num`` rows is compared
    # against the expected tuple at the same position.
    for i, expected in enumerate(top_hits):
        assert top_associations[i * top_hit_num].tolist() == expected
def test_load_top_associations_by_top_threshold_and_maf(self):
    """Check threshold-based retrieval combined with a MAF cut-off of 0.1."""
    hits, _thresholds = hdf5.get_top_associations(
        self.hdf5_file,
        1e-5,
        maf=0.1,
        top_or_threshold='threshold',
    )
    assert len(hits) == 13
    # No returned association may fall below the requested MAF.
    below_cutoff = hits['maf'] < 0.1
    assert not np.any(below_cutoff)