def __init__(self, primary_dimension, secondary_dimension=None):
    """
    Build a DataTable over one or two dimensions.

    Each dimension may be given either as a string key, which is resolved
    through ``registry.get_dimension``, or as an already-resolved
    dimension object.

    :type primary_dimension: registry.models.CategoricalDimension
    :type secondary_dimension: registry.models.CategoricalDimension
    """
    # String keys are swapped for the registered dimension objects.
    if isinstance(primary_dimension, basestring):
        primary_dimension = registry.get_dimension(primary_dimension)

    if secondary_dimension is not None:
        if isinstance(secondary_dimension, basestring):
            secondary_dimension = registry.get_dimension(secondary_dimension)

        # HACK (the original author called this "a dirty way"): a secondary
        # dimension whose key is "groups" is dropped, leaving the table
        # with the primary dimension only.
        if hasattr(secondary_dimension, 'key') and secondary_dimension.key == "groups":
            secondary_dimension = None

    self.primary_dimension = primary_dimension
    self.secondary_dimension = secondary_dimension
    self.mode = "default"
def test_double_quantitative_one_wide(self):
    """Two quantitative dimensions render correctly when one of them needs binning."""
    field_names = ('shared_count', 'replied_to_count')
    dimension_keys = ('shares', 'replies')
    pairs = [(0, 1), (2, 3), (6, 59999), (6, 60000)]

    counts = self.get_distribution(pairs)
    dataset = self.generate_messages_for_multi_distribution(field_names, counts)

    # Expected table: the replies values 1 and 3 land in bin 0, while
    # 59999 and 60000 share bin 59995, so their counts are added together.
    first, second, third, fourth = pairs
    expected = {
        (0, 0): counts[first],
        (2, 0): counts[second],
        (6, 59995): counts[third] + counts[fourth],
    }

    primary, secondary = [registry.get_dimension(key) for key in dimension_keys]
    table = models.DataTable(primary, secondary)
    rendered = table.render(dataset.message_set.all(), desired_secondary_bins=5)

    self.assertMultiDistributionsEqual(rendered, expected, dimension_keys,
                                       measure_key='value')
def __init__(self, primary_dimension, secondary_dimension=None):
    """
    Build a DataTable over one or two dimensions.

    Each dimension may be given either as a string key, which is resolved
    through ``registry.get_dimension``, or as an already-resolved
    dimension object.

    :type primary_dimension: registry.models.CategoricalDimension
    :type secondary_dimension: registry.models.CategoricalDimension
    """
    # String keys are swapped for the registered dimension objects.
    if isinstance(primary_dimension, basestring):
        primary_dimension = registry.get_dimension(primary_dimension)

    if secondary_dimension is not None:
        if isinstance(secondary_dimension, basestring):
            secondary_dimension = registry.get_dimension(secondary_dimension)

        # HACK (the original author called this "a dirty way"): a secondary
        # dimension whose key is "groups" is dropped, leaving the table
        # with the primary dimension only.
        if hasattr(secondary_dimension, 'key') and secondary_dimension.key == "groups":
            secondary_dimension = None

    self.primary_dimension = primary_dimension
    self.secondary_dimension = secondary_dimension
    self.mode = "default"
def test_render_single_quantitative_wide(self):
    """
    A single quantitative dimension renders when its values are spread wide.

    The values range from 0 to 60000, so the table is rendered with
    ``desired_primary_bins=5`` and compared against a binned expectation.
    """
    values = [0, 2, 3, 4, 60000]
    counts = self.get_distribution(values)
    dataset = self.generate_messages_for_distribution(
        field_name='shared_count',
        distribution=counts,
    )

    # Expected table: the four small values share bin 0 and the outlier
    # keeps a bin of its own.
    small_values = values[:4]
    outlier = values[4]
    expected = {
        0: sum(counts[value] for value in small_values),
        60000: counts[outlier],
    }

    table = models.DataTable(registry.get_dimension('shares'))
    rendered = table.render(dataset.message_set.all(), desired_primary_bins=5)

    self.assertDistributionsEqual(rendered, expected,
                                  level_key='shares', measure_key='value')
def setUp(self):
    """Prepare matching serialized and deserialized forms of a one-dimension, one-filter payload."""
    self.dimension = dimensions.get_dimension('time')
    self.dataset = corpus_models.Dataset.objects.create(
        name="test dataset",
        description='description',
    )

    # A five-minute time window starting (roughly) now.
    window_start = now()
    window_end = now() + timedelta(minutes=5)
    internal_filter = {
        'dimension': self.dimension,
        'min_time': window_start,
        'max_time': window_end,
    }
    serialized_filter = serializers.FilterSerializer(internal_filter).data

    # External form: ids, keys and serialized filters.
    self.serialized_representation = {
        'dataset': self.dataset.id,
        'dimensions': [self.dimension.key],
        'filters': [serialized_filter],
    }

    # Internal form: should look up exactly the same objects.
    self.deserialized_representation = {
        'dataset': self.dataset,
        'dimensions': [self.dimension],
        'filters': [internal_filter],
    }
def test_boolean_domain(self):
    """The ``contains_url`` dimension reports a two-level domain equal to ``dimension.domain``."""
    dataset = self.create_empty_dataset()
    dimension = registry.get_dimension("contains_url")

    # Materialize the domain so it can be measured and compared as a list.
    domain = list(dimension.get_domain(dataset.message_set.all()))

    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(len(domain), 2)
    self.assertEqual(domain, dimension.domain)
def test_double_quantitative_narrow(self):
    """Two quantitative dimensions render unchanged when binning is not needed."""
    field_names = ('shared_count', 'replied_to_count')
    dimension_keys = ('shares', 'replies')
    pairs = [(0, 1), (2, 3), (3, 2), (4, 5), (6, 7)]

    expected = self.get_distribution(pairs)
    dataset = self.generate_messages_for_multi_distribution(field_names, expected)

    primary, secondary = [registry.get_dimension(key) for key in dimension_keys]
    table = models.DataTable(primary, secondary)
    rendered = table.render(dataset.message_set.all())

    # The rendered table is compared with the input distribution itself.
    self.assertMultiDistributionsEqual(rendered, expected, dimension_keys,
                                       measure_key='value')
def setUp(self):
    """Prepare the serialized dict and the deserialized object for the ``time`` dimension."""
    time_dimension = dimensions.get_dimension('time')
    self.dimension = time_dimension

    self.serialized_representation = dict(
        key=time_dimension.key,
        name=time_dimension.name,
        description=time_dimension.description,
    )

    # Should lookup exactly the same dimension
    self.deserialized_representation = time_dimension
def test_quantitative_domain(self):
    """Reply counts of 1 and 2001 with ``bins=10`` give a domain in steps of 200 from 0 to 2200."""
    reply_values = [1, 2001]
    distribution = self.get_distribution(reply_values)
    dataset = self.generate_messages_for_distribution("replied_to_count", distribution)

    dimension = registry.get_dimension("replies")
    domain = list(dimension.get_domain(dataset.message_set.all(), bins=10))

    expected_domain = [
        0, 200, 400, 600, 800, 1000,
        1200, 1400, 1600, 1800, 2000, 2200,
    ]
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(domain, expected_domain)
def generate_some_messages(self, dataset):
    """
    Add two messages to ``dataset`` and store three dimensions on the test.

    The first message has no hashtag and a ``shared_count`` of 0. The
    second mentions the ``OurPriorities`` hashtag, is linked to it, and has
    a ``shared_count`` of 10. Both use the same timestamp. The ``time``,
    ``hashtags`` and ``shares`` dimensions are saved as attributes on
    ``self``.
    """
    timestamp = "2015-02-02T01:19:02Z"
    create_message = corpus_models.Message.objects.create

    # Plain message without any hashtag.
    create_message(
        dataset=dataset,
        text="blah blah blah",
        time=timestamp,
        shared_count=0,
    )

    # Message that mentions the hashtag and is linked to it.
    tag = corpus_models.Hashtag.objects.create(text="OurPriorities")
    tagged_message = create_message(
        dataset=dataset,
        text="blah blah blah #%s" % tag.text,
        time=timestamp,
        shared_count=10,
    )
    tagged_message.hashtags.add(tag)

    self.dimension_time = registry.get_dimension('time')
    self.dimension_hashtags = registry.get_dimension('hashtags')
    self.dimension_shared = registry.get_dimension('shares')
def doDistributionTest(self, dimension_key, dataset, distribution, **kwargs):
    """
    Render a one-dimension DataTable and compare it with ``distribution``.

    :param dimension_key: registry key of the dimension; also used as the
        ``level_key`` of the comparison
    :param dataset: object whose ``message_set`` supplies the messages
    :param distribution: expected distribution handed to
        ``assertDistributionsEqual``
    :param kwargs: forwarded unchanged to ``DataTable.render``
    """
    table = models.DataTable(registry.get_dimension(dimension_key))
    messages = dataset.message_set.all()
    rendered = table.render(messages, **kwargs)

    self.assertDistributionsEqual(rendered, distribution,
                                  level_key=dimension_key,
                                  measure_key='value')
def setUp(self):
    """Prepare internal and external forms of a ``sentiment`` levels filter."""
    # The original assigned self.dimension twice in one chained statement;
    # a single assignment has the same effect.
    self.dimension = dimensions.get_dimension('sentiment')

    # Internal form refers to the dimension object itself.
    self.internal_filter = {
        'dimension': self.dimension,
        'levels': ['a', 'b', 'c'],
    }

    # External form refers to the dimension by key and reuses the same
    # levels list object as the internal filter.
    self.external_filter = {
        'dimension': self.dimension.key,
        'levels': self.internal_filter['levels'],
    }
def test_render_double_categorical(self):
    """Can produce a datatable with two categorical dimensions."""
    field_names = ('contains_url', 'contains_mention')
    # Every combination of the two boolean fields.
    combinations = [(True, True), (True, False), (False, True), (False, False)]

    expected = self.get_distribution(combinations)
    dataset = self.generate_messages_for_multi_distribution(
        field_names=field_names,
        distribution=expected,
    )

    # The field names double as the dimension keys here.
    primary_key, secondary_key = field_names
    primary = registry.get_dimension(primary_key)
    secondary = registry.get_dimension(secondary_key)

    table = models.DataTable(primary, secondary)
    rendered = table.render(dataset.message_set.all())

    self.assertMultiDistributionsEqual(rendered, expected, field_names,
                                       measure_key='value')
def test_render_two_related_categorical(self):
    """Can produce a datatable with two related categorical dimensions."""
    # Create some language labels and five authors.
    language_ids = self.create_test_languages()
    usernames = ['username_%d' % d for d in xrange(5)]
    dataset = self.create_authors_with_values('username', usernames)
    author_ids = dataset.person_set.values_list('id', flat=True).distinct()

    # Pair every language with every author, skipping the pairs where both
    # ids are even so that the distribution has gaps.
    value_pairs = [
        (lang, author)
        for lang in language_ids
        for author in author_ids
        if not (lang % 2 == 0 and author % 2 == 0)
    ]

    # Distribute some messages over those pairs.
    id_distribution = self.get_distribution(value_pairs)
    self.generate_messages_for_multi_distribution(
        ('language_id', 'sender_id'), id_distribution, dataset=dataset)

    # Get the expected distribution keyed by name/username instead of ids.
    related_models = (corpus_models.Language, corpus_models.Person)
    expected = self.convert_id_distribution_to_related(
        id_distribution, related_models, ('name', 'username'))

    language_dimension = registry.get_dimension('language')
    sender_dimension = registry.get_dimension('sender')
    table = models.DataTable(language_dimension, sender_dimension)
    rendered = table.render(dataset.message_set.all())

    self.assertMultiDistributionsEqual(rendered, expected,
                                       ('language', 'sender'),
                                       measure_key='value')
def setUp(self):
    """Prepare internal and external forms of a five-minute ``time`` filter."""
    self.dimension = dimensions.get_dimension('time')

    window_start = now()
    window_end = now() + timedelta(minutes=5)

    # Internal form: dimension object and datetime bounds.
    self.internal_filter = {
        'dimension': self.dimension,
        'min_time': window_start,
        'max_time': window_end,
    }

    # External form: dimension key and bounds run through api_time_format.
    self.external_filter = {
        'dimension': self.dimension.key,
        'min_time': api_time_format(self.internal_filter['min_time']),
        'max_time': api_time_format(self.internal_filter['max_time']),
    }
def setUp(self):
    """Prepare internal and external forms of a ``replies`` range filter (5 to 10)."""
    replies = dimensions.get_dimension('replies')
    self.dimension = replies

    # Internal form refers to the dimension object ...
    self.internal_filter = dict(dimension=replies, min=5, max=10)
    # ... while the external form refers to it by key.
    self.external_filter = dict(dimension=replies.key, min=5, max=10)
def test_excludes_all_data(self):
    """
    If the filters exclude all the data, an empty result set should be
    produced.
    """
    field_names = ('shared_count', 'replied_to_count')
    values = [(1, 1), (1, 4), (1, 3), (2, 1), (2, 2)]
    bi_distribution = self.get_distribution(values)
    dataset = self.generate_messages_for_multi_distribution(
        field_names, bi_distribution)

    shares = registry.get_dimension('shares')
    replies = registry.get_dimension('replies')
    datatable = models.DataTable(shares, replies)

    # None of the value pairs above has shared_count in 2..5 together with
    # replied_to_count in 3..5, so this queryset matches no messages.
    filtered = dataset.message_set.filter(
        shared_count__range=(2, 5),
        replied_to_count__range=(3, 5),
    )
    result = datatable.render(filtered)

    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(result.count(), 0)
def test_time_domain(self):
    """
    Two messages one day apart with ``bins=24`` give a 26-entry time domain.

    The first entry and the last two entries are checked against the
    message times truncated to the hour.
    """
    base_time = tz.datetime(2012, 5, 2, 20, 10, 2, 0)
    if settings.USE_TZ:
        # Make the timestamp timezone-aware when the project uses time zones.
        base_time = base_time.replace(tzinfo=tz.utc)
    time_values = [base_time, base_time + timedelta(days=1)]

    distribution = self.get_distribution(time_values)
    dataset = self.generate_messages_for_distribution("time", distribution)

    dimension = registry.get_dimension("time")
    result = dimension.get_domain(dataset.message_set.all(), bins=24)

    # Both message times truncated to the start of their hour.
    first_hour = base_time.replace(minute=0, second=0)
    last_hour = time_values[1].replace(minute=0, second=0)

    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(len(result), 26)
    self.assertEqual(result[0], first_hour)
    self.assertEqual(result[24], last_hour)
    self.assertEqual(result[25], last_hour + timedelta(hours=1))
def test_time_domain_just_over(self):
    """If the domain is a little over a convenient bin size, rounds down"""
    # Four minutes and 10 seconds is a perfect 50 bins of 5 seconds
    # so this is 4 seconds extra.
    start_time = dateparse.parse_datetime("2014-03-21T00:00:00Z")
    end_time = dateparse.parse_datetime("2014-03-21T00:04:14Z")

    distribution = self.get_distribution([start_time, end_time])
    dataset = self.generate_messages_for_distribution("time", distribution)

    dimension = registry.get_dimension("time")
    result = dimension.get_domain(dataset.message_set.all(), bins=50)

    # Should have decided to use 5 second increments.
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(result[0], start_time)
    self.assertEqual(result[1], start_time + timedelta(seconds=5))
def test_categorical_domain(self):
    """
    Checks that the domain of a categorical model field, in this case
    Sentiment, can be calculated correctly.
    """
    # Split the (value, label) choice pairs; only the values are needed.
    sentiment_values, _labels = zip(*corpus_models.Message.SENTIMENT_CHOICES)

    sentiment_distribution = self.get_distribution(sentiment_values)
    dataset = self.generate_messages_for_distribution(
        field_name="sentiment",
        distribution=sentiment_distribution,
    )

    dimension = registry.get_dimension("sentiment")
    result = dimension.get_domain(dataset.message_set.all())

    # The domain is expected in the order of SENTIMENT_CHOICES.
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(result, sentiment_values)
def test_render_single_categorical(self):
    """Can produce a datatable with a single categorical dimension."""
    expected = self.get_distribution([True, False])
    dataset = self.generate_messages_for_distribution(
        field_name='contains_url',
        distribution=expected,
    )

    table = models.DataTable(registry.get_dimension('contains_url'))
    rendered = table.render(dataset.message_set.all())

    # The rendered table is compared with the input distribution itself.
    self.assertDistributionsEqual(rendered, expected,
                                  level_key='contains_url',
                                  measure_key='value')
def test_render_single_quantitative_narrow(self):
    """
    A single quantitative dimension renders when its values are close together.

    The values only span 0 to 6, so no bin hints are passed and the
    rendered table is compared with the unbinned distribution.
    """
    expected = self.get_distribution([0, 2, 3, 4, 6])
    dataset = self.generate_messages_for_distribution(
        field_name='shared_count',
        distribution=expected,
    )

    table = models.DataTable(registry.get_dimension('shares'))
    rendered = table.render(dataset.message_set.all())

    self.assertDistributionsEqual(rendered, expected,
                                  level_key='shares', measure_key='value')
def test_render_single_related_categorical(self):
    """Can produce a datatable with a single related categorical dimension."""
    # Create some language labels and a distribution keyed by their ids.
    language_ids = self.create_test_languages()
    id_distribution = self.get_distribution(language_ids)

    # The same distribution, recovered through the Language 'name' field,
    # is what the rendered table is compared against.
    expected = self.recover_related_field_distribution(
        id_distribution, corpus_models.Language, 'name')

    dataset = self.generate_messages_for_distribution(
        field_name='language_id',
        distribution=id_distribution,
    )

    table = models.DataTable(registry.get_dimension('language'))
    rendered = table.render(dataset.message_set.all())

    self.assertDistributionsEqual(rendered, expected,
                                  level_key='language', measure_key='value')
def setUp(self):
    """Prepare matching serialized and deserialized forms of a one-dimension, one-filter payload."""
    self.dimension = dimensions.get_dimension('time')
    self.dataset = corpus_models.Dataset.objects.create(
        name="test dataset",
        description='description',
    )

    # A five-minute time window starting (roughly) now.
    window_start = now()
    window_end = now() + timedelta(minutes=5)
    internal_filter = {
        'dimension': self.dimension,
        'min_time': window_start,
        'max_time': window_end,
    }
    serialized_filter = serializers.FilterSerializer(internal_filter).data

    # External form: ids, keys and serialized filters.
    self.serialized_representation = {
        'dataset': self.dataset.id,
        'dimensions': [self.dimension.key],
        'filters': [serialized_filter],
    }

    # Internal form: should look up exactly the same objects.
    self.deserialized_representation = {
        'dataset': self.dataset,
        'dimensions': [self.dimension],
        'filters': [internal_filter],
    }
def test_related_categorical_domain(self):
    """
    Checks that the domain of a categorical related model field, in this
    case Language, can be calculated correctly.
    """
    # Create some language labels
    languages = self.create_test_languages(model=True)
    language_ids = [lang.id for lang in languages]
    language_names = [lang.name for lang in languages]

    dimension = registry.get_dimension("language")

    # Generate a distribution where messages increase with each lang id
    language_distribution = self.get_distribution(language_ids)
    dataset = self.generate_messages_for_distribution(
        field_name="language_id",
        distribution=language_distribution,
    )
    result = dimension.get_domain(dataset.message_set.all())

    # results are in descending frequency order
    # (assertEqual replaces the deprecated assertEquals alias)
    self.assertEqual(result, list(reversed(language_names)))

    # Generate another dataset with the distribution going the other way
    language_distribution = self.get_distribution(reversed(language_ids))
    dataset = self.generate_messages_for_distribution(
        field_name="language_id",
        distribution=language_distribution,
    )
    result = dimension.get_domain(dataset.message_set.all())

    self.assertEqual(result, language_names)
def run_time_bin_test(self, delta, desired_bins, expected_bin_size):
    """
    Run a generic time bin test.

    :param delta: amount added to ``self.base_time`` to get the end of the range
    :param desired_bins: bin count passed to the dimension's ``_get_bin_size``
    :param expected_bin_size: value ``_get_bin_size`` is expected to return
    """
    range_start = self.base_time
    range_end = range_start + delta

    dimension = registry.get_dimension('time')
    bin_size = dimension._get_bin_size(range_start, range_end, desired_bins)

    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(bin_size, expected_bin_size)
def to_internal_value(self, data):
    """
    Return the registered dimension named by ``data['key']``.

    :param data: mapping that carries the dimension key under ``'key'``
    :return: whatever ``registry.get_dimension`` returns for that key
    """
    dimension_key = data['key']
    return registry.get_dimension(dimension_key)
def test_registry_rejects_unknown_keys(self):
    """Requesting a dimension for a nonexistent key raises a KeyError."""
    self.assertRaises(KeyError, registry.get_dimension, 'made_up_dimension_key')
def test_registry_contains_dimension(self):
    """The registry knows the ``time`` dimension and returns it as a TimeDimension."""
    time_dimension = registry.get_dimension('time')

    self.assertIsNotNone(time_dimension)
    self.assertIsInstance(time_dimension, models.TimeDimension)