예제 #1
0
    def test_double_quantitative_one_wide(self):
        """Render two quantitative dimensions where one of them needs binning.

        The last two value pairs differ only in ``replied_to_count`` (59999
        and 60000); the expected table merges them under the key
        ``(6, 59995)``.
        """
        dimension_keys = ('shares', 'replies')
        values = [(0, 1), (2, 3), (6, 59999), (6, 60000)]
        quant_distribution = self.get_distribution(values)

        dataset = self.generate_messages_for_multi_distribution(
            ('shared_count', 'replied_to_count'), quant_distribution)

        # Expected table: the first two pairs keep their own rows, while the
        # two wide pairs are added together into one row.
        first, second, wide_low, wide_high = values
        expected = {
            (0, 0): quant_distribution[first],
            (2, 0): quant_distribution[second],
            (6, 59995): (quant_distribution[wide_low] +
                         quant_distribution[wide_high]),
        }

        dimensions = [registry.get_dimension(key) for key in dimension_keys]
        table = models.DataTable(*dimensions)
        rendered = table.render(dataset.message_set.all(),
                                desired_secondary_bins=5)

        self.assertMultiDistributionsEqual(
            rendered, expected, dimension_keys, measure_key='value')
예제 #2
0
    def test_render_single_quantitative_wide(self):
        """
        Render a datatable with one widely spread quantitative dimension.

        The expected table is binned: the four small values are summed under
        key 0, and 60000 keeps a key of its own.
        """
        values = [0, 2, 3, 4, 60000]
        quant_distribution = self.get_distribution(values)

        dataset = self.generate_messages_for_distribution(
            field_name='shared_count', distribution=quant_distribution)

        # Accumulate the counts of the four small values into the low bin.
        low_bin_total = 0
        for small_value in values[:4]:
            low_bin_total += quant_distribution[small_value]
        expected = {
            0: low_bin_total,
            60000: quant_distribution[values[4]],
        }

        table = models.DataTable(registry.get_dimension('shares'))
        rendered = table.render(dataset.message_set.all(),
                                desired_primary_bins=5)

        self.assertDistributionsEqual(
            rendered, expected, level_key='shares', measure_key='value')
예제 #3
0
    def doDistributionTest(self, dimension_key, dataset, distribution,
                           **kwargs):
        """Render one dimension over a dataset and check its distribution.

        Looks up ``dimension_key`` in the registry, renders a single-dimension
        datatable over all of the dataset's messages (forwarding ``kwargs``
        to ``render``) and asserts the result matches ``distribution``.
        """
        table = models.DataTable(registry.get_dimension(dimension_key))
        messages = dataset.message_set.all()
        rendered = table.render(messages, **kwargs)

        self.assertDistributionsEqual(
            rendered, distribution,
            level_key=dimension_key, measure_key='value')
예제 #4
0
    def post(self, request, format=None):
        """Build a data table for the posted parameters and return it.

        The request payload is recorded with ``add_history`` and validated
        with ``DataTableSerializer``. When there are no filters or excludes
        and exactly one categorical dimension is requested, the result comes
        from ``dataset.get_precalc_distribution``; otherwise a ``DataTable``
        is built and ``generate`` is called on it.

        Returns a 200 response containing the validated data plus a
        ``result`` key, or a 400 response carrying the serializer errors.
        """
        add_history(self.request.user, 'data-table', request.data)

        serializer = serializers.DataTableSerializer(data=request.data)
        if not serializer.is_valid():
            return Response(serializer.errors,
                            status=status.HTTP_400_BAD_REQUEST)

        data = serializer.validated_data

        dataset = data['dataset']
        dimensions = data['dimensions']
        filters = data.get('filters', [])
        exclude = data.get('exclude', [])
        search_key = data.get('search_key')
        mode = data.get('mode')
        # An empty (or missing/None) group list is passed on as None.
        groups = data.get('groups') or None

        # Paging: default page size is 100; both values are clamped to >= 1.
        page_size = 100
        if data.get('page_size'):
            page_size = max(1, int(data.get('page_size')))
        page = None
        if data.get('page'):
            page = max(1, int(data.get('page')))

        # isinstance() replaces the Python-2-only ``types.ListType`` check;
        # the chained ``and`` keeps the original short-circuit order.
        use_precalc = (
            isinstance(filters, list) and not filters and
            isinstance(exclude, list) and not exclude and
            len(dimensions) == 1 and dimensions[0].is_categorical()
        )

        if use_precalc:
            result = dataset.get_precalc_distribution(
                dimension=dimensions[0], search_key=search_key,
                page=page, page_size=page_size, mode=mode)
        else:
            datatable = datatable_models.DataTable(*dimensions)
            if mode is not None:
                datatable.set_mode(mode)

            result = datatable.generate(dataset, filters, exclude,
                                        page_size, page, search_key, groups)

        # Just add the result key to the validated data and serialize it.
        data['result'] = result

        output = serializers.DataTableSerializer(data)
        return Response(output.data, status=status.HTTP_200_OK)
예제 #5
0
    def test_double_quantitative_narrow(self):
        """Render two quantitative dimensions when no binning is needed.

        The rendered table is compared directly against the generated
        distribution.
        """
        dimension_keys = ('shares', 'replies')
        values = [(0, 1), (2, 3), (3, 2), (4, 5), (6, 7)]
        quant_distribution = self.get_distribution(values)

        dataset = self.generate_messages_for_multi_distribution(
            ('shared_count', 'replied_to_count'), quant_distribution)

        dimensions = [registry.get_dimension(key) for key in dimension_keys]
        table = models.DataTable(*dimensions)
        rendered = table.render(dataset.message_set.all())

        self.assertMultiDistributionsEqual(
            rendered, quant_distribution, dimension_keys,
            measure_key='value')
예제 #6
0
    def test_render_single_categorical(self):
        """Render a datatable with one categorical dimension.

        Messages are generated over the boolean ``contains_url`` field and
        the rendered table must match that distribution.
        """
        field = 'contains_url'
        bool_distribution = self.get_distribution([True, False])

        dataset = self.generate_messages_for_distribution(
            field_name=field, distribution=bool_distribution)

        table = models.DataTable(registry.get_dimension(field))
        rendered = table.render(dataset.message_set.all())

        self.assertDistributionsEqual(
            rendered, bool_distribution,
            level_key=field, measure_key='value')
예제 #7
0
    def test_render_double_categorical(self):
        """Render a datatable with two categorical dimensions.

        All four combinations of the two boolean fields are generated, and
        the rendered table must match the resulting distribution.
        """
        field_names = ('contains_url', 'contains_mention')
        combinations = [(True, True), (True, False),
                        (False, True), (False, False)]
        bi_bool_distribution = self.get_distribution(combinations)

        dataset = self.generate_messages_for_multi_distribution(
            field_names=field_names, distribution=bi_bool_distribution)

        # The dimension keys are the same as the message field names here.
        dimensions = [registry.get_dimension(name) for name in field_names]
        table = models.DataTable(*dimensions)
        rendered = table.render(dataset.message_set.all())

        self.assertMultiDistributionsEqual(
            rendered, bi_bool_distribution, field_names,
            measure_key='value')
예제 #8
0
    def test_render_two_related_categorical(self):
        """Can produce a datatable with two related categorical dimensions.

        Messages are distributed over (language id, sender id) pairs, and the
        rendered table is compared against the same distribution expressed
        through the related ``name`` / ``username`` fields.
        """

        # Create some language labels
        language_ids = self.create_test_languages()
        # range() is equivalent to xrange() here and exists on Python 2 and 3.
        usernames = ['username_%d' % d for d in range(5)]
        dataset = self.create_authors_with_values('username', usernames)
        author_ids = dataset.person_set.values_list('id', flat=True).distinct()

        # Create language/person pairs, skipping the cases where both ids are
        # even so that the distribution has gaps.
        value_pairs = [
            (lang, author)
            for lang in language_ids
            for author in author_ids
            if lang % 2 != 0 or author % 2 != 0
        ]

        # Distribute some messages
        id_distribution = self.get_distribution(value_pairs)
        self.generate_messages_for_multi_distribution(
            ('language_id', 'sender_id'), id_distribution, dataset=dataset)

        # Get the actual expected distribution
        value_distribution = self.convert_id_distribution_to_related(
            id_distribution, (corpus_models.Language, corpus_models.Person),
            ('name', 'username'))

        d1 = registry.get_dimension('language')
        d2 = registry.get_dimension('sender')

        datatable = models.DataTable(d1, d2)
        result = datatable.render(dataset.message_set.all())

        self.assertMultiDistributionsEqual(result,
                                           value_distribution,
                                           ('language', 'sender'),
                                           measure_key='value')
예제 #9
0
    def test_render_single_quantitative_narrow(self):
        """
        Render a datatable with one narrowly spread quantitative dimension.

        No bin count is requested and the rendered table is compared directly
        against the generated distribution.
        """
        quant_distribution = self.get_distribution([0, 2, 3, 4, 6])

        dataset = self.generate_messages_for_distribution(
            field_name='shared_count', distribution=quant_distribution)

        table = models.DataTable(registry.get_dimension('shares'))
        rendered = table.render(dataset.message_set.all())

        self.assertDistributionsEqual(
            rendered, quant_distribution,
            level_key='shares', measure_key='value')
예제 #10
0
    def test_render_single_related_categorical(self):
        """Render a datatable with one related categorical dimension.

        Messages are distributed over language ids; the rendered table is
        compared against the same distribution recovered through the
        ``name`` field of ``Language``.
        """
        # Create some language labels and a distribution over their ids.
        language_ids = self.create_test_languages()
        id_distribution = self.get_distribution(language_ids)
        expected = self.recover_related_field_distribution(
            id_distribution, corpus_models.Language, 'name')

        dataset = self.generate_messages_for_distribution(
            field_name='language_id', distribution=id_distribution)

        table = models.DataTable(registry.get_dimension('language'))
        rendered = table.render(dataset.message_set.all())

        self.assertDistributionsEqual(
            rendered, expected, level_key='language', measure_key='value')
예제 #11
0
def precalc_categorical_dimension(dataset_id=1, dimension_key=None):
    """Recompute the stored distribution of one dimension for a dataset.

    Existing ``PrecalcCategoricalDistribution`` rows for the dataset and
    dimension key are deleted, the datatable is generated, and one row is
    bulk-created per entry of the result's ``"table"``. A ``None`` level is
    stored as an empty string.
    """
    table = datatable_models.DataTable(primary_dimension=dimension_key)
    dataset = Dataset.objects.get(id=dataset_id)

    # Remove the existing calculation before generating the new one.
    stale_rows = PrecalcCategoricalDistribution.objects.filter(
        dataset=dataset, dimension_key=dimension_key)
    stale_rows.delete()

    def build_row(bucket):
        # Turn one generated bucket into an unsaved model instance.
        level = bucket[dimension_key]
        return PrecalcCategoricalDistribution(
            dataset=dataset,
            dimension_key=dimension_key,
            level="" if level is None else level,
            count=bucket["value"])

    generated = table.generate(dataset)
    rows = [build_row(bucket) for bucket in generated["table"]]

    PrecalcCategoricalDistribution.objects.bulk_create(objs=rows,
                                                       batch_size=10000)
예제 #12
0
    def test_excludes_all_data(self):
        """
        If the filters exclude all the data, an empty result set should be produced.
        """

        field_names = ('shared_count', 'replied_to_count')
        values = [(1, 1), (1, 4), (1, 3), (2, 1), (2, 2)]
        bi_distribution = self.get_distribution(values)

        dataset = self.generate_messages_for_multi_distribution(
            field_names, bi_distribution)

        d1 = registry.get_dimension('shares')
        d2 = registry.get_dimension('replies')

        datatable = models.DataTable(d1, d2)

        # None of the value pairs has shared_count in 2..5 together with
        # replied_to_count in 3..5, so the filtered set is empty.
        filtered = dataset.message_set.filter(
            shared_count__range=(2, 5),
            replied_to_count__range=(3, 5),
        )

        result = datatable.render(filtered)
        # assertEqual rather than the deprecated assertEquals alias.
        self.assertEqual(result.count(), 0)
예제 #13
0
 def test_create_with_one_dimension(self):
     """A table built from a single dimension has no secondary dimension."""
     only_dimension = mock.Mock(spec=CategoricalDimension)
     table = models.DataTable(only_dimension)
     self.assertIsNone(table.secondary_dimension)
예제 #14
0
 def test_create_with_dimensions(self, get_dimension):
     """Dimension objects are accepted without any ``get_dimension`` call.

     ``get_dimension`` is presumably a mock injected by a patch decorator
     outside this block -- TODO confirm.
     """
     d1 = mock.Mock(spec=CategoricalDimension)
     d2 = mock.Mock(spec=CategoricalDimension)
     # Only construction matters here; the instance itself is not needed.
     models.DataTable(d1, d2)
     # assertEqual rather than the deprecated assertEquals alias.
     self.assertEqual(get_dimension.call_count, 0)
예제 #15
0
    def test_create_with_keys(self, get_dimension):
        """String arguments are each looked up through ``get_dimension``."""
        keys = ('foo', 'bar')
        models.DataTable(*keys)

        # One lookup per key is expected; the order does not matter.
        expected_calls = [mock.call(key) for key in keys]
        get_dimension.assert_has_calls(expected_calls, any_order=True)