def test_double_quantitative_one_wide(self):
    """Can it render two quant dimensions, when one requires binning?"""
    values = [(0, 1), (2, 3), (6, 59999), (6, 60000)]
    quant_distribution = self.get_distribution(values)
    dataset = self.generate_messages_for_multi_distribution(
        ('shared_count', 'replied_to_count'), quant_distribution)

    # With 5 bins requested on the wide secondary dimension, the two
    # points near 60000 collapse into the same (6, 59995) bucket.
    binned_distribution = {
        (0, 0): quant_distribution[values[0]],
        (2, 0): quant_distribution[values[1]],
        (6, 59995): (quant_distribution[values[2]] +
                     quant_distribution[values[3]]),
    }

    primary = registry.get_dimension('shares')
    secondary = registry.get_dimension('replies')
    table = models.DataTable(primary, secondary)
    result = table.render(dataset.message_set.all(),
                          desired_secondary_bins=5)

    self.assertMultiDistributionsEqual(
        result, binned_distribution, ('shares', 'replies'),
        measure_key='value')
def test_render_single_quantitative_wide(self):
    """
    Can produce a datatable with only a single quantitative dimension.
    The distribution is very wide and binning must be used.
    """
    values = [0, 2, 3, 4, 60000]
    quant_distribution = self.get_distribution(values)
    dataset = self.generate_messages_for_distribution(
        field_name='shared_count',
        distribution=quant_distribution,
    )

    # The four low values land in the first bin; the outlier gets its own.
    low_total = sum(quant_distribution[v] for v in values[:4])
    binned_distribution = {
        0: low_total,
        60000: quant_distribution[values[4]],
    }

    dimension = registry.get_dimension('shares')
    table = models.DataTable(dimension)
    result = table.render(dataset.message_set.all(),
                          desired_primary_bins=5)

    self.assertDistributionsEqual(result, binned_distribution,
                                  level_key='shares',
                                  measure_key='value')
def doDistributionTest(self, dimension_key, dataset, distribution, **kwargs):
    """Render a one-dimension datatable and check it matches ``distribution``.

    Extra keyword arguments are forwarded to ``DataTable.render``.
    """
    dimension = registry.get_dimension(dimension_key)

    # Calculate the distribution of messages over this dimension.
    table = models.DataTable(dimension)
    result = table.render(dataset.message_set.all(), **kwargs)

    self.assertDistributionsEqual(result, distribution,
                                  level_key=dimension_key,
                                  measure_key='value')
def post(self, request, format=None):
    """Render a datatable for the requested dataset and dimensions.

    Validates the request body with DataTableSerializer, then either uses a
    precalculated distribution (single categorical dimension, no filters)
    or generates the table on the fly. Returns the echoed request data with
    a ``result`` key on success, or the validation errors with a 400.
    """
    add_history(self.request.user, 'data-table', request.data)

    # 'serializer' instead of 'input' to avoid shadowing the builtin.
    serializer = serializers.DataTableSerializer(data=request.data)
    if serializer.is_valid():
        data = serializer.validated_data

        dataset = data['dataset']
        dimensions = data['dimensions']
        filters = data.get('filters', [])
        exclude = data.get('exclude', [])
        search_key = data.get('search_key')
        mode = data.get('mode')

        # Treat a missing, null, or empty groups list as "no grouping".
        groups = data.get('groups', []) or None

        # Pagination defaults; clamp client-supplied values to >= 1.
        page_size = 100
        page = None
        if data.get('page_size'):
            page_size = max(1, int(data.get('page_size')))
        if data.get('page'):
            page = max(1, int(data.get('page')))

        # Fast path: one categorical dimension with no filtering can be
        # served from the precalculated distribution table.
        if (isinstance(filters, list) and not filters and
                isinstance(exclude, list) and not exclude and
                len(dimensions) == 1 and dimensions[0].is_categorical()):
            result = dataset.get_precalc_distribution(
                dimension=dimensions[0],
                search_key=search_key,
                page=page,
                page_size=page_size,
                mode=mode)
        else:
            datatable = datatable_models.DataTable(*dimensions)
            if mode is not None:
                datatable.set_mode(mode)
            result = datatable.generate(dataset, filters, exclude,
                                        page_size, page, search_key, groups)

        # Echo the validated input back with the result attached.
        response_data = data
        response_data['result'] = result

        output = serializers.DataTableSerializer(response_data)
        return Response(output.data, status=status.HTTP_200_OK)

    return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST)
def test_double_quantitative_narrow(self):
    """Can it render two quantitative dimensions when binning is not needed."""
    values = [(0, 1), (2, 3), (3, 2), (4, 5), (6, 7)]
    quant_distribution = self.get_distribution(values)
    dataset = self.generate_messages_for_multi_distribution(
        ('shared_count', 'replied_to_count'), quant_distribution)

    primary = registry.get_dimension('shares')
    secondary = registry.get_dimension('replies')
    table = models.DataTable(primary, secondary)

    # Narrow range: the rendered table should match the raw distribution.
    result = table.render(dataset.message_set.all())
    self.assertMultiDistributionsEqual(
        result, quant_distribution, ('shares', 'replies'),
        measure_key='value')
def test_render_single_categorical(self):
    """Can produce a datatable with a single categorical dimension."""
    bool_distribution = self.get_distribution([True, False])
    dataset = self.generate_messages_for_distribution(
        field_name='contains_url',
        distribution=bool_distribution,
    )

    table = models.DataTable(registry.get_dimension('contains_url'))
    result = table.render(dataset.message_set.all())

    self.assertDistributionsEqual(result, bool_distribution,
                                  level_key='contains_url',
                                  measure_key='value')
def test_render_double_categorical(self):
    """Can produce a datatable with a two categorical dimensions."""
    field_names = ('contains_url', 'contains_mention')

    # Every combination of the two boolean fields.
    values = [(True, True), (True, False), (False, True), (False, False)]
    bi_bool_distribution = self.get_distribution(values)
    dataset = self.generate_messages_for_multi_distribution(
        field_names=field_names,
        distribution=bi_bool_distribution,
    )

    primary = registry.get_dimension(field_names[0])
    secondary = registry.get_dimension(field_names[1])
    table = models.DataTable(primary, secondary)
    result = table.render(dataset.message_set.all())

    self.assertMultiDistributionsEqual(result, bi_bool_distribution,
                                       field_names, measure_key='value')
def test_render_two_related_categorical(self):
    """Can produce a datatable with two related categorical dimensions."""
    # Create some language labels and a handful of authors.
    language_ids = self.create_test_languages()
    dataset = self.create_authors_with_values(
        'username', ['username_%d' % d for d in xrange(5)])
    author_ids = dataset.person_set.values_list('id', flat=True).distinct()

    # Build language/person pairs, skipping even/even combinations so
    # the distribution has gaps.
    value_pairs = [
        (lang, author)
        for lang in language_ids
        for author in author_ids
        if not (lang % 2 == 0 and author % 2 == 0)
    ]

    # Distribute some messages over those id pairs.
    id_distribution = self.get_distribution(value_pairs)
    self.generate_messages_for_multi_distribution(
        ('language_id', 'sender_id'), id_distribution, dataset=dataset)

    # Translate the id-based distribution to the related display values.
    value_distribution = self.convert_id_distribution_to_related(
        id_distribution,
        (corpus_models.Language, corpus_models.Person),
        ('name', 'username'))

    primary = registry.get_dimension('language')
    secondary = registry.get_dimension('sender')
    table = models.DataTable(primary, secondary)
    result = table.render(dataset.message_set.all())

    self.assertMultiDistributionsEqual(result, value_distribution,
                                       ('language', 'sender'),
                                       measure_key='value')
def test_render_single_quantitative_narrow(self):
    """
    Can produce a datatable with only a single quantitative dimension.
    The distribution is small enough no binning is needed.
    """
    quant_distribution = self.get_distribution([0, 2, 3, 4, 6])
    dataset = self.generate_messages_for_distribution(
        field_name='shared_count',
        distribution=quant_distribution,
    )

    table = models.DataTable(registry.get_dimension('shares'))
    result = table.render(dataset.message_set.all())

    # No binning expected: result should equal the raw distribution.
    self.assertDistributionsEqual(result, quant_distribution,
                                  level_key='shares',
                                  measure_key='value')
def test_render_single_related_categorical(self):
    """Can produce a datatable with a single related categorical dimension."""
    # Create some language labels and distribute messages over their ids.
    language_ids = self.create_test_languages()
    language_distribution = self.get_distribution(language_ids)

    # The expected output keys on language *names*, not ids.
    language_name_distribution = self.recover_related_field_distribution(
        language_distribution, corpus_models.Language, 'name')

    dataset = self.generate_messages_for_distribution(
        field_name='language_id',
        distribution=language_distribution,
    )

    table = models.DataTable(registry.get_dimension('language'))
    result = table.render(dataset.message_set.all())

    self.assertDistributionsEqual(result, language_name_distribution,
                                  level_key='language',
                                  measure_key='value')
def precalc_categorical_dimension(dataset_id=1, dimension_key=None):
    """Precompute and cache the categorical distribution for one dimension.

    Replaces any existing PrecalcCategoricalDistribution rows for the
    given dataset/dimension with freshly generated counts.
    """
    table = datatable_models.DataTable(primary_dimension=dimension_key)
    dataset = Dataset.objects.get(id=dataset_id)

    # Remove the existing calculation before regenerating it.
    PrecalcCategoricalDistribution.objects.filter(
        dataset=dataset, dimension_key=dimension_key).delete()

    result = table.generate(dataset)

    rows = []
    for bucket in result["table"]:
        level = bucket[dimension_key]
        rows.append(PrecalcCategoricalDistribution(
            dataset=dataset,
            dimension_key=dimension_key,
            # Store None levels as the empty string.
            level="" if level is None else level,
            count=bucket["value"],
        ))

    PrecalcCategoricalDistribution.objects.bulk_create(objs=rows,
                                                       batch_size=10000)
def test_excludes_all_data(self):
    """
    If the filters exclude all the data, an empty result set
    should be produced.
    """
    field_names = ('shared_count', 'replied_to_count')
    values = [(1, 1), (1, 4), (1, 3), (2, 1), (2, 2)]
    bi_distribution = self.get_distribution(values)
    dataset = self.generate_messages_for_multi_distribution(
        field_names, bi_distribution)

    primary = registry.get_dimension('shares')
    secondary = registry.get_dimension('replies')
    table = models.DataTable(primary, secondary)

    # These range filters match none of the generated value pairs.
    filtered = dataset.message_set.filter(
        shared_count__range=(2, 5),
        replied_to_count__range=(3, 5),
    )

    result = table.render(filtered)
    self.assertEquals(result.count(), 0)
def test_create_with_one_dimension(self):
    """Can be created with only one dimension"""
    only_dimension = mock.Mock(spec=CategoricalDimension)
    table = models.DataTable(only_dimension)
    # With no second argument, the secondary dimension is left unset.
    self.assertIsNone(table.secondary_dimension)
def test_create_with_dimensions(self, get_dimension):
    """Accepts arguments that are dimensions"""
    primary = mock.Mock(spec=CategoricalDimension)
    secondary = mock.Mock(spec=CategoricalDimension)
    models.DataTable(primary, secondary)
    # Real dimension objects should be used as-is, not looked up by key.
    self.assertEquals(get_dimension.call_count, 0)
def test_create_with_keys(self, get_dimension):
    """If given strings, finds the matching dimensions."""
    models.DataTable('foo', 'bar')
    # Both string keys should be resolved through the registry.
    expected_lookups = [mock.call('foo'), mock.call('bar')]
    get_dimension.assert_has_calls(expected_lookups, any_order=True)