    def get_dataverse_count(self, **extra_filters):
        """Return the Dataverse count -- a single number.

        ``extra_filters`` are merged into the date filter parameters and
        passed to ``Dataverse.objects.filter``.

        Returns the stored error result if an error was recorded earlier;
        otherwise a success ``StatsResult`` holding ``count``,
        ``count_string`` (thousands-separated) and the SQL text.
        """
        if self.was_error_found():
            return self.get_error_msg_return()

        filter_params = self.get_date_filter_params()
        if extra_filters:
            filter_params.update(extra_filters)

        if self.include_harvested:
            q = Dataverse.objects.filter(**filter_params)
        else:
            # NOTE(review): the tail of this query was truncated in the file.
            # The exclusion is rebuilt from the ``dvobject__id__in`` /
            # ``get_harvested_dataverse_ids()`` pairing used in
            # get_dataverse_counts_by_month -- confirm against history.
            q = Dataverse.objects.filter(**filter_params).exclude(
                dvobject__id__in=self.get_harvested_dataverse_ids())

        sql_query = str(q.query)

        data_dict = OrderedDict()
        data_dict['count'] = q.count()
        data_dict['count_string'] = "{:,}".format(data_dict['count'])

        return StatsResult.build_success_result(data_dict, sql_query)
    def get_dataverse_count(self, **extra_filters):
        """Return the Dataverse count -- a single number.

        Keyword arguments are added to the date filter parameters as extra
        queryset lookups.

        Returns the stored error result when ``was_error_found()`` is true;
        otherwise a success ``StatsResult`` with ``count``, ``count_string``
        and the SQL query string.
        """
        if self.was_error_found():
            return self.get_error_msg_return()

        filter_params = self.get_date_filter_params()
        for k, v in extra_filters.items():
            filter_params[k] = v

        q = Dataverse.objects.filter(**filter_params)
        if not self.include_harvested:
            # NOTE(review): this clause was cut off in the file; it is
            # reconstructed from the harvested-id exclusion key used in
            # get_dataverse_counts_by_month -- confirm before relying on it.
            q = q.exclude(
                dvobject__id__in=self.get_harvested_dataverse_ids())

        sql_query = str(q.query)

        data_dict = OrderedDict()
        data_dict['count'] = q.count()
        data_dict['count_string'] = "{:,}".format(data_dict['count'])

        return StatsResult.build_success_result(data_dict, sql_query)
    def get_easy_file_downloads_by_month(self, **extra_filters):
        """Format per-month file download counts into a StatsResult.

        Each input row must provide ``yyyy_mm`` (a string starting with
        ``YYYY-MM``) and ``count``. Each output record carries ``yyyy_mm``,
        ``count``, ``running_total``, ``year_num``, ``month_num`` and, when
        available, ``month_name`` / ``month_name_short``.

        When ``self.noncumulative`` is true, ``running_total`` in each record
        is just that month's count; the overall total is still accumulated
        and reported as ``total_downloads``.

        Returns an error ``StatsResult`` if a month cannot be parsed as an
        integer.
        """
        import logging

        # NOTE(review): the arguments of these two calls were truncated in
        # the file; passing ``**extra_filters`` through is an assumption.
        file_counts_by_month = self.get_easy_file_downloads_counts(
            **extra_filters)
        running_total = self.get_easy_file_downloads_running_total(
            **extra_filters)
        noncumulative = self.noncumulative

        formatted_records = []  # move from a queryset to a []

        for d in file_counts_by_month:

            year_month = d['yyyy_mm'][:7]
            year = int(d['yyyy_mm'][:4])
            try:
                month = int(d['yyyy_mm'][5:7])
            except ValueError:
                return StatsResult.build_error_result(
                    "in converting %s (month) into an integer (in get_easy_dataset_count_by_month)"
                    % d['yyyy_mm'][5:7])

            fmt_rec = OrderedDict()
            fmt_rec['yyyy_mm'] = year_month
            fmt_rec['count'] = d['count']

            # running total
            running_total += d['count']
            if noncumulative:
                fmt_rec['running_total'] = d['count']
            else:
                fmt_rec['running_total'] = running_total

            # Add year and month numbers
            fmt_rec['year_num'] = year
            fmt_rec['month_num'] = month

            # Add month name
            month_name_found, month_name_short = get_month_name_abbreviation(
                month)
            if month_name_found:
                assume_month_name_found, fmt_rec[
                    'month_name'] = get_month_name(month)
                fmt_rec['month_name_short'] = month_name_short
            else:
                # NOTE(review): the original call wrapping this message was
                # lost; a warning log is assumed.
                logging.getLogger(__name__).warning(
                    "no month name found for month %d (get_easy_file_downloads_by_month)"
                    % month)

            # Add formatted record
            formatted_records.append(fmt_rec)

        data_dict = OrderedDict()
        data_dict['total_downloads'] = running_total
        data_dict['record_count'] = len(formatted_records)
        data_dict['records'] = formatted_records

        return StatsResult.build_success_result(data_dict, None)
    def get_easy_counts_by_month(self, ds_counts_by_month, running_total,
                                 noncumulative):
        """Format per-month count rows into a StatsResult.

        Args:
            ds_counts_by_month: iterable of mappings with ``yyyy_mm`` (a
                string starting with ``YYYY-MM``) and ``count``.
            running_total: starting value for the cumulative total.
            noncumulative: when true, each record's ``running_total`` is
                that month's count rather than the cumulative value.
                NOTE(review): this parameter was cut from the truncated
                signature and is reconstructed from its use in the body;
                confirm whether it originally had a default.

        Returns:
            A success ``StatsResult`` with ``record_count`` and ``records``,
            or an error ``StatsResult`` if a month cannot be parsed as an
            integer.
        """
        import logging

        formatted_records = []

        for d in ds_counts_by_month:
            year_month = d['yyyy_mm'][:7]
            year = int(d['yyyy_mm'][:4])
            try:
                month = int(d['yyyy_mm'][5:7])
            except ValueError:
                return StatsResult.build_error_result(
                    "in converting %s (month) into an integer (in get_easy_dataset_count_by_month)"
                    % d['yyyy_mm'][5:7])

            fmt_dict = OrderedDict()
            fmt_dict['yyyy_mm'] = year_month
            fmt_dict['count'] = d['count']

            # running total
            running_total += d['count']
            if noncumulative:
                fmt_dict['running_total'] = d['count']
            else:
                fmt_dict['running_total'] = running_total

            # Add year and month numbers
            fmt_dict['year_num'] = year
            fmt_dict['month_num'] = month

            # Add month name
            month_name_found, month_name_short = get_month_name_abbreviation(
                month)

            if month_name_found:
                assume_month_name_found, fmt_dict[
                    'month_name'] = get_month_name(month)
                fmt_dict['month_name_short'] = month_name_short
            else:
                # NOTE(review): the original call wrapping this message was
                # lost; a warning log is assumed.
                logging.getLogger(__name__).warning(
                    "no month name found for month %d (get_easy_dataset_count_by_month)"
                    % month)

            # Add formatted record
            formatted_records.append(fmt_dict)

        data_dict = OrderedDict()
        data_dict['record_count'] = len(formatted_records)
        data_dict['records'] = formatted_records

        return StatsResult.build_success_result(data_dict, None)
    def view_file_extensions_within_type(self, file_type=None):
        """View extensions for files based on their "Filemetadata.contenttype" value

        With ``file_type=None`` every distinct (datafile id, label) pair is
        examined; otherwise only Datafiles whose ``contenttype`` equals
        ``file_type``.

        Returns a success ``StatsResult`` whose ``records`` list holds one
        entry per extension (``extension``, ``count``, ``total_count``,
        ``percent_string``), highest count first, plus overall totals.
        """
        if file_type is None:
            # Retrieve list of **all** file names -- this could be too much!
            name_rows = FileMetadata.objects.distinct(
                'datafile__id', 'label'
            ).values_list('datafile__id', 'label')
        else:
            # Retrieve ids of Datafile filtered by "contenttype"
            ids = Datafile.objects.filter(contenttype=file_type).values_list(
                'dvobject__id', flat=True)

            # Retrieve the names of these Datafiles via the FileMetadata object
            name_rows = FileMetadata.objects.filter(
                datafile__in=ids
            ).distinct(
                'datafile__id', 'label'
            ).values_list('datafile__id', 'label')

        # Tally extensions; each row is (datafile id, label).
        extension_counts = {}  # {file extension : count, ...}
        for info in name_rows:
            ext = splitext(info[1])[-1]
            extension_counts[ext] = extension_counts.get(ext, 0) + 1

        # Sort the counts in descending order--highest count first
        ext_pairs = sorted(extension_counts.items(),
                           key=lambda pair: pair[1],
                           reverse=True)

        # Float total so the divisions below are true division on Python 2.
        total_count = float(sum(pair[1] for pair in ext_pairs))

        ext_list = []
        for extension, count in ext_pairs:
            d = OrderedDict(extension=extension)
            d['count'] = count
            d['total_count'] = int(total_count)
            d['percent_string'] = '{0:.3%}'.format(count / total_count)
            ext_list.append(d)

        data_dict = OrderedDict(number_unique_extensions=len(ext_pairs))
        data_dict['total_file_count'] = int(total_count)
        data_dict['record_count'] = len(ext_list)
        data_dict['records'] = ext_list
        data_dict['all_dv_files'] = Datafile.objects.all().count()
        # Guard against an empty Datafile table (division by zero).
        if data_dict['all_dv_files']:
            ratio = total_count / data_dict['all_dv_files']
        else:
            ratio = 0.0
        data_dict['percent_unknown'] = '{0:.3%}'.format(ratio)

        return StatsResult.build_success_result(data_dict)
    def get_stats_result(self, request):
        """Return the StatsResult for the Dataset named by the ``ds_id`` kwarg.

        A missing ``ds_id`` yields a 400 error result; a Dataset with no
        latest version yields a 404 error result. Otherwise the latest
        version is serialized and wrapped in a success result.
        """
        dataset_id = self.kwargs.get('ds_id', None)
        if dataset_id is None:
            return StatsResult.build_error_result("No Dataset id specified", 400)

        # Get the latest version
        latest_version = get_latest_dataset_version(dataset_id)
        if latest_version is not None:
            serialized = DatasetSerializer(latest_version).as_json()
            return StatsResult.build_success_result(serialized)

        return StatsResult.build_error_result('No published Dataset with id: %s' % dataset_id, 404)
    def view_file_extensions_within_type(self, file_type=None):
        """View extensions for files based on their "Filemetadata.contenttype" value

        If ``file_type`` is None, all distinct (datafile id, label) pairs
        are used; otherwise only those belonging to Datafiles with that
        ``contenttype``.

        Returns a success ``StatsResult`` with per-extension records
        (``extension``, ``count``, ``total_count``, ``percent_string``)
        sorted by descending count, together with overall totals.
        """
        if file_type is None:
            # Retrieve list of **all** file names -- this could be too much!
            labels = FileMetadata.objects.distinct('datafile__id', 'label')
        else:
            # Retrieve ids of Datafile filtered by "contenttype"
            ids = Datafile.objects.filter(contenttype=file_type).values_list('dvobject__id', flat=True)

            # Retrieve the names of these Datafiles via the FileMetadata object
            labels = FileMetadata.objects.filter(datafile__in=ids).distinct('datafile__id', 'label')
        labels = labels.values_list('datafile__id', 'label')

        # Convert the file names to file extensions and count them
        extension_counts = {}   # {file extension : count, file ext : count, etc}
        for _datafile_id, label in labels:
            ext = splitext(label)[-1]
            extension_counts[ext] = extension_counts.get(ext, 0) + 1

        # Sort the counts in descending order--highest count first
        ext_pairs = sorted(extension_counts.items(), key=lambda k: k[1], reverse=True)

        # Kept as a float so the percentage divisions are not truncated.
        total_count = sum(x[1] for x in ext_pairs) + 0.000

        ext_list = []
        for ext_pair in ext_pairs:
            d = OrderedDict(extension=ext_pair[0])
            d['count'] = ext_pair[1]
            d['total_count'] = int(total_count)
            d['percent_string'] = '{0:.3%}'.format(ext_pair[1] / total_count)
            ext_list.append(d)

        data_dict = OrderedDict(number_unique_extensions=len(ext_pairs))
        data_dict['total_file_count'] = int(total_count)
        data_dict['record_count'] = len(ext_list)
        data_dict['records'] = ext_list
        data_dict['all_dv_files'] = Datafile.objects.all().count()
        # Avoid ZeroDivisionError when there are no Datafiles at all.
        all_files = data_dict['all_dv_files']
        data_dict['percent_unknown'] = '{0:.3%}'.format(
            total_count / all_files if all_files else 0.0)

        return StatsResult.build_success_result(data_dict)
    def get_stats_result(self, request):
        """Return the StatsResult object for this statistic

        Looks up a Dataverse by the ``alias`` URL kwarg. A missing alias
        yields a 400 error result; no matching Dataverse yields a 404 error
        result. Otherwise the Dataverse is serialized and wrapped in a
        success result.
        """
        alias = self.kwargs.get('alias', None)
        if alias is None:
            return StatsResult.build_error_result("No Dataverse 'alias' specified", 400)

        try:
            # NOTE(review): the lookup arguments were truncated in the file.
            # ``alias=alias`` follows from the error messages; the
            # publication-date condition is an assumption based on the "No
            # published Dataverse" wording -- confirm against history.
            selected_dv = Dataverse.objects.select_related('dvobject').get(
                alias=alias,
                dvobject__publicationdate__isnull=False)
        except Dataverse.DoesNotExist:
            return StatsResult.build_error_result('No published Dataverse with alias: %s' % alias, 404)

        dataverse_as_json = DataverseSerializer(selected_dv).as_json()

        return StatsResult.build_success_result(dataverse_as_json)
    def get_stats_result(self, request):
        """Return the StatsResult object for this statistic

        Reads the Dataset id from the ``ds_id`` URL kwarg. A missing id
        yields a 400 error result; a Dataset with no latest version yields a
        404 error result. Otherwise the latest version is serialized and
        wrapped in a success result.
        """
        dv_id = self.kwargs.get('ds_id', None)
        if dv_id is None:
            # The status code was cut off in the file; 400 matches the other
            # "not specified" error results in this file.
            return StatsResult.build_error_result("No Dataset id specified",
                                                  400)

        # Get the latest version
        dataset_version = get_latest_dataset_version(dv_id)

        if dataset_version is None:
            return StatsResult.build_error_result(
                'No published Dataset with id: %s' % dv_id, 404)

        dataset_as_json = DatasetSerializer(dataset_version).as_json()

        return StatsResult.build_success_result(dataset_as_json)
    def get_dataset_category_counts(self, **extra_filters):
        """Dataset counts by subject.

        Builds one record per category with ``category``, ``count``,
        ``total_count``, ``percent_string`` and ``percent_number``, and
        returns them in a success StatsResult.

        NOTE(review): this block is damaged -- see the inline notes. It does
        not parse as written.
        """

        # Was an error found earlier?
        if self.was_error_found():
            return self.get_error_msg_return()

            # NOTE(review): the two assignments below look like the bodies of
            # an ``if ...: / else:`` pair whose condition lines are missing;
            # as indented they sit after the ``return`` above. The second
            # call's arguments are also truncated. The selecting condition
            # cannot be recovered from this file.
            ds_values = self.get_easy_dataset_category_counts()
            ds_values = self.get_dataverse_dataset_subject_counts(

        # -----------------------------
        # Iterate through the vocab values,
        # process the totals, calculate percentage
        # -----------------------------
        # NOTE(review): ``running_total`` is assigned but never used below.
        running_total = 0
        formatted_records = []  # move from a queryset to a []
        # ``+ 0.00`` makes the total a float so the division below is not
        # truncated.
        total_count = sum([rec['cnt'] for rec in ds_values]) + 0.00

        for info in ds_values:
            rec = OrderedDict()
            rec['category'] = info['category']

            # count
            rec['count'] = info['cnt']
            rec['total_count'] = int(total_count)

            # percent
            float_percent = info['cnt'] / total_count
            rec['percent_string'] = '{0:.1%}'.format(float_percent)
            rec['percent_number'] = float("%.3f" % (float_percent))

            # total count
            # NOTE(review): ``rec`` is never appended to
            # ``formatted_records`` -- the append line appears to be missing,
            # so ``records`` is always empty.


        data_dict = OrderedDict()
        data_dict['record_count'] = len(formatted_records)
        data_dict['records'] = formatted_records

        return StatsResult.build_success_result(data_dict)
    def get_total_file_downloads(self, **extra_filters):
        """Get the total file download count

        Counts ``GuestBookResponse`` rows matching the date filters (on
        ``responsetime``) plus any ``extra_filters``.

        The special key ``INCLUDE_PRE_DV4_DOWNLOADS`` is not used as a
        queryset lookup: its presence stops rows with a null
        ``responsetime`` from being excluded.

        Returns the stored error result if an error was recorded earlier;
        otherwise a success ``StatsResult`` with ``count``, ``count_string``
        and the SQL text.
        """
        if self.was_error_found():
            return self.get_error_msg_return()

        filter_params = self.get_date_filter_params(date_var_name='responsetime')

        # Narrow down to specific Dataverses
        # NOTE(review): the statement(s) that did the narrowing are missing
        # from the file; only the follow-up error check survives.
        if self.was_error_found():
            return self.get_error_msg_return()

        # Add extra filters, if they exist
        count_pre_dv4_downloads = False
        # Iterate over a snapshot: the dict is modified inside the loop.
        for k, v in list(extra_filters.items()):
            if k == INCLUDE_PRE_DV4_DOWNLOADS:    # skip this param
                count_pre_dv4_downloads = True
                del extra_filters[k]
            else:
                filter_params[k] = v

        if count_pre_dv4_downloads:
            exclude_params = {}
        else:
            exclude_params = dict(responsetime__isnull=True)

        # NOTE(review): the tail of this query was truncated; applying
        # ``filter_params`` is inferred from their construction above.
        q = GuestBookResponse.objects.exclude(**exclude_params
                    ).filter(**filter_params)

        sql_query = str(q.query)

        data_dict = OrderedDict()
        data_dict['count'] = q.count()
        data_dict['count_string'] = "{:,}".format(data_dict['count'])

        return StatsResult.build_success_result(data_dict, sql_query)
    def get_total_file_downloads(self, **extra_filters):
        """Get the total file download count

        Counts ``GuestBookResponse`` rows that match the ``responsetime``
        date filters and any ``extra_filters``.

        ``INCLUDE_PRE_DV4_DOWNLOADS`` is a flag, not a lookup: when present
        among ``extra_filters``, rows with a null ``responsetime`` are no
        longer excluded.

        Returns the stored error result if an error was recorded earlier;
        otherwise a success ``StatsResult`` with ``count``, ``count_string``
        and the SQL text.
        """
        if self.was_error_found():
            return self.get_error_msg_return()

        # NOTE(review): this call's argument was truncated in the file;
        # ``date_var_name='responsetime'`` matches the ``responsetime``
        # lookup used for the exclusion below -- confirm.
        filter_params = self.get_date_filter_params(
            date_var_name='responsetime')

        # Narrow down to specific Dataverses
        # NOTE(review): the narrowing statement(s) are missing from the file.
        if self.was_error_found():
            return self.get_error_msg_return()

        # Add extra filters, if they exist
        count_pre_dv4_downloads = INCLUDE_PRE_DV4_DOWNLOADS in extra_filters
        if count_pre_dv4_downloads:
            # The flag must not reach the queryset as a lookup.
            del extra_filters[INCLUDE_PRE_DV4_DOWNLOADS]
        filter_params.update(extra_filters)

        if count_pre_dv4_downloads:
            exclude_params = {}
        else:
            exclude_params = dict(responsetime__isnull=True)

        # NOTE(review): the end of this query was truncated; the filter
        # clause is inferred from ``filter_params`` being built above.
        q = GuestBookResponse.objects.exclude(
            **exclude_params).filter(**filter_params)

        sql_query = str(q.query)

        data_dict = OrderedDict()
        data_dict['count'] = q.count()
        data_dict['count_string'] = "{:,}".format(data_dict['count'])

        return StatsResult.build_success_result(data_dict, sql_query)
    def get_dataset_count(self, **extra_filters):
        """Return the Dataset count

        ``extra_filters`` are merged into the date filter parameters and
        passed to ``Dataset.objects.filter``.

        Returns the stored error result if an error was recorded earlier;
        otherwise a success ``StatsResult`` with ``count``, ``count_string``
        (thousands-separated) and the SQL text.
        """
        if self.was_error_found():
            return self.get_error_msg_return()

        filter_params = self.get_date_filter_params()
        if extra_filters:
            filter_params.update(extra_filters)

        q = Dataset.objects.filter(**filter_params)
        sql_query = str(q.query)

        data_dict = OrderedDict()
        data_dict['count'] = q.count()
        data_dict['count_string'] = "{:,}".format(data_dict['count'])

        return StatsResult.build_success_result(data_dict, sql_query)
    def get_datafile_count(self, **extra_filters):
        """Return the Datafile count

        ``extra_filters`` are merged into the date filter parameters and
        passed to ``Datafile.objects.filter``.

        Returns the stored error result if an error was recorded earlier;
        otherwise a success ``StatsResult`` with ``count``, ``count_string``
        (thousands-separated) and the SQL text.
        """
        if self.was_error_found():
            return self.get_error_msg_return()

        filter_params = self.get_date_filter_params()

        # Add extra filters, if they exist
        if extra_filters:
            filter_params.update(extra_filters)

        q = Datafile.objects.filter(**filter_params)
        sql_query = str(q.query)

        data_dict = OrderedDict()
        data_dict['count'] = q.count()
        data_dict['count_string'] = "{:,}".format(data_dict['count'])

        return StatsResult.build_success_result(data_dict, sql_query)
    def get_stats_result(self, request):
        """Return the StatsResult for the Dataset named by ``persistentId``.

        The id is read from the request's GET parameters. A missing id
        yields a 400 error result. A 404 error result is returned when no
        Dataset matches, when its dvobject has no publication date, or when
        it has no latest version.
        """
        persistent_id = request.GET.get('persistentId', None)
        if persistent_id is None:
            return StatsResult.build_error_result(
                "No Dataset persistent id specified", 400)

        dataset = Dataset.get_dataset_by_persistent_id(persistent_id)
        not_found_msg = 'No published dataset found for persistentId: %s' % persistent_id

        # Unknown or unpublished Datasets are reported the same way.
        if dataset is None:
            return StatsResult.build_error_result(not_found_msg, 404)
        if not dataset.dvobject.publicationdate:
            return StatsResult.build_error_result(not_found_msg, 404)

        # Get the latest version
        latest_version = get_latest_dataset_version(dataset.dvobject.id)
        if latest_version is None:
            return StatsResult.build_error_result(not_found_msg, 404)

        return StatsResult.build_success_result(
            DatasetSerializer(latest_version).as_json())
    def get_stats_result(self, request):
        """Look up a Dataset by its ``persistentId`` GET parameter.

        Returns a 400 error result when the parameter is absent and a 404
        error result when the Dataset is missing, has no publication date on
        its dvobject, or has no latest version; otherwise a success result
        holding the serialized latest version.
        """
        pid = request.GET.get('persistentId', None)
        if pid is None:
            return StatsResult.build_error_result("No Dataset persistent id specified", 400)

        found = Dataset.get_dataset_by_persistent_id(pid)

        err_404 = 'No published dataset found for persistentId: %s' % pid

        is_published = found is not None and bool(found.dvobject.publicationdate)
        if not is_published:
            return StatsResult.build_error_result(err_404, 404)

        # Get the latest version
        version = get_latest_dataset_version(found.dvobject.id)
        if version is not None:
            as_json = DatasetSerializer(version).as_json()
            return StatsResult.build_success_result(as_json)

        return StatsResult.build_error_result(err_404, 404)
    def get_dataverse_affiliation_counts(self, **extra_filters):
        # NOTE(review): the lines from "Return Dataverse counts by
        # affiliation" down to the last "percent_string" entry are docstring
        # text whose triple-quote delimiters (and the braces of the example
        # payload) are missing; the block does not parse as written.
        Return Dataverse counts by affiliation

        Returns: dv_counts_by_affiliation": [
                "affiliation": "University of Oxford",
                "affiliation_count": 2,
                "total_count": 191,
                "percent_string": "1.0%"
                "affiliation": "University of Illinois",
                "affiliation_count": 1,
                "total_count": 191,
                "percent_string": "0.5%"
        if self.was_error_found():
            return self.get_error_msg_return()

        # Retrieve the date parameters
        filter_params = self.get_date_filter_params(DVOBJECT_CREATEDATE_ATTR)

        if extra_filters:
            for k, v in extra_filters.items():
                filter_params[k] = v

        # NOTE(review): this query is truncated after select_related(...).
        # The code below expects each row to be a mapping with 'affiliation'
        # and 'affiliation_count', so a values()/annotate() tail is
        # presumably missing -- it cannot be recovered from this file.
        dataverse_counts_by_affil = Dataverse.objects.select_related('dvobject'\

        # -----------------------------------
        # Get SQL query string
        # -----------------------------------
        sql_query = str(dataverse_counts_by_affil.query)

        # Count all dataverses
        total_count = sum([rec.get('affiliation_count', 0) for rec in dataverse_counts_by_affil])
        # Force a float so the percentage division below is not truncated.
        total_count = total_count + 0.0

        # NOTE(review): leftover debug output, written as a Python 2 print
        # statement.
        print 'dataverse_counts_by_affil', dataverse_counts_by_affil

        # Format the records, adding 'total_count' and 'percent_string' to each one
        formatted_records = []
        for rec in dataverse_counts_by_affil:
            # Rows with a zero count are skipped.
            if rec.get('affiliation_count', 0) > 0:
                fmt_dict = OrderedDict()
                affil_str = rec.get('affiliation', None)
                if affil_str is not None:
                    affil_str = affil_str.encode('utf-8')
                fmt_dict['affiliation'] = affil_str

                fmt_dict['affiliation_count'] = rec.get('affiliation_count', 0)

                # NOTE(review): an ``else:`` appears to be missing before the
                # last two assignments in this branch; as written they
                # overwrite the computed values with 0 / '0%'.
                if total_count > 0:
                    float_percent = rec.get('affiliation_count', 0) / total_count
                    fmt_dict['total_count'] = int(total_count)
                    fmt_dict['percent_string'] = '{0:.1%}'.format(float_percent)
                    fmt_dict['total_count'] = 0
                    fmt_dict['percent_string'] = '0%'

                # NOTE(review): ``fmt_dict`` is never appended to
                # ``formatted_records``; the append line appears to be
                # missing, so ``records`` is always empty.

        data_dict = OrderedDict()
        data_dict['record_count'] = len(formatted_records)
        data_dict['records'] = formatted_records

        return StatsResult.build_success_result(data_dict, sql_query)
    def get_dataverse_counts_by_type(self, exclude_uncategorized=True, **extra_filters):
        # NOTE(review): the lines from "Return dataverse counts by
        # 'dataversetype'" down to "... etc" are docstring text whose
        # triple-quote delimiters (and the braces of the example payload)
        # are missing; the block does not parse as written.
        Return dataverse counts by 'dataversetype'

        Optional if a dataverse is uncategorized:
            - Specifying 'uncategorized_replacement_name' will
                set "UNCATEGORIZED" to another string

        Returns: { "dv_counts_by_type": [
                            "dataversetype": "RESEARCH_PROJECTS",
                            "type_count": 85,
                            "total_count": 356,
                            "percent_string": "23.9%"
                            "dataversetype": "TEACHING_COURSES",
                            "type_count": 10,
                            "total_count": 356,
                            "percent_string": "2.8%"
                            ... etc
        if self.was_error_found():
            return self.get_error_msg_return()

        # Retrieve the date parameters
        filter_params = self.get_date_filter_params(DVOBJECT_CREATEDATE_ATTR)

        # Add extra filters
        if extra_filters:
            for k, v in extra_filters.items():
                filter_params[k] = v

        # NOTE(review): an ``else:`` appears to be missing between the two
        # assignments below; as written the exclusion is always reset to {}.
        if exclude_uncategorized:
            exclude_params = dict(dataversetype=DATAVERSE_TYPE_UNCATEGORIZED)
            exclude_params = {}

        # NOTE(review): this query is truncated after select_related(...).
        # The code below expects rows that are mappings with 'dataversetype'
        # and 'type_count'; the exclude/filter/values/annotate tail cannot
        # be recovered from this file.
        dataverse_counts_by_type = Dataverse.objects.select_related('dvobject'\

        # -----------------------------------
        # Get SQL query string
        # -----------------------------------
        sql_query = str(dataverse_counts_by_type.query)

        # Count all dataverses
        total_count = sum([rec.get('type_count', 0) for rec in dataverse_counts_by_type])
        # Force a float so the percentage division below is not truncated.
        total_count = total_count + 0.0

        # Format the records, adding 'total_count' and 'percent_string' to each one
        formatted_records = []
        for rec in dataverse_counts_by_type:
            fmt_dict = OrderedDict()
            fmt_dict['dataversetype'] = rec['dataversetype']
            # Label is the type name with underscores turned into spaces.
            fmt_dict['dataversetype_label'] = rec['dataversetype'].replace('_', ' ')
            fmt_dict['type_count'] = rec.get('type_count', 0)

            # NOTE(review): an ``else:`` appears to be missing before the
            # last two assignments in this branch; as written they overwrite
            # the computed values with 0 / '0%'.
            if total_count > 0:
                float_percent = rec.get('type_count', 0) / total_count
                fmt_dict['total_count'] = int(total_count)
                fmt_dict['percent_string'] = '{0:.1%}'.format(float_percent)
                fmt_dict['total_count'] = 0
                fmt_dict['percent_string'] = '0%'

            # NOTE(review): ``fmt_dict`` is never appended to
            # ``formatted_records``; the append line appears to be missing.

        data_dict = OrderedDict()
        data_dict['record_count'] = len(formatted_records)
        data_dict['records'] = formatted_records

        return StatsResult.build_success_result(data_dict, sql_query)
    def get_dataverse_counts_by_month(self, date_param=DVOBJECT_CREATEDATE_ATTR, **extra_filters):
        # NOTE(review): the next line is docstring text whose triple-quote
        # delimiters are missing; the block does not parse as written.
        Return Dataverse counts by month
        # Was an error found earlier?
        if self.was_error_found():
            return self.get_error_msg_return()

        # -----------------------------------
        # (1) Build query filters
        # -----------------------------------

        # Exclude records where dates are null
        #   - e.g. a record may not have a publication date

        exclude_params = { '%s__isnull' % date_param : True}
        # NOTE(review): as written, harvested Dataverses are excluded when
        # ``include_harvested`` is TRUE, which reads as inverted; a ``not``
        # or an ``else:`` may have been lost -- confirm against history.
        if self.include_harvested:
            exclude_params['dvobject__id__in'] = self.get_harvested_dataverse_ids()

        # Retrieve the date parameters
        filter_params = self.get_date_filter_params()

        # Add extra filters from kwargs
        if extra_filters:
            for k, v in extra_filters.items():
                filter_params[k] = v

        # -----------------------------------
        # (2) Construct query
        # -----------------------------------

        # add exclude filters date filters
        # NOTE(review): this query is truncated after select_related(...);
        # the exclude/filter tail that would apply ``exclude_params`` and
        # ``filter_params`` is missing.
        dv_counts_by_month = Dataverse.objects.select_related('dvobject'\

        # annotate query adding "month_year" and "count"
        # NOTE(review): 'count' is selected in values() but nothing visible
        # here defines it; a count annotation was presumably among the lost
        # lines. ``self.time_sort`` is prefixed to the order_by field name.
        dv_counts_by_month = dv_counts_by_month.annotate(\
            yyyy_mm=TruncYearMonth('%s' % date_param)\
            ).values('yyyy_mm', 'count'\
            ).order_by('%syyyy_mm' % self.time_sort)

        # -----------------------------------
        # (2a) Get SQL query string
        # -----------------------------------
        sql_query = str(dv_counts_by_month.query)

        # -----------------------------------
        # (3) Format results
        # -----------------------------------
        # hold the running total count
        running_total = self.get_dataverse_count_start_point(**extra_filters)
        formatted_records = []  # move from a queryset to a []

        for d in dv_counts_by_month:
            rec_fmt = OrderedDict()

            # change the datetime object to a string
            rec_fmt['yyyy_mm'] = d['yyyy_mm'].strftime('%Y-%m')
            rec_fmt['count'] = d['count']

            # running total
            running_total += d['count']
            rec_fmt['running_total'] = running_total
            # d['month_year'] = d['yyyy_mm'].strftime('%Y-%m')

            # Add year and month numbers
            rec_fmt['year_num'] = d['yyyy_mm'].year
            rec_fmt['month_num'] = d['yyyy_mm'].month

            # Add month name
            month_name_found, month_name_short = get_month_name_abbreviation(d['yyyy_mm'].month)
            # NOTE(review): the "Log it" comment below presumably belonged
            # to an ``else:`` branch that is no longer present.
            if month_name_found:
                assume_month_name_found, rec_fmt['month_name'] = get_month_name(d['yyyy_mm'].month)
                rec_fmt['month_name_short'] = month_name_short
                # Log it!!!!!!

            # Add formatted record
            # NOTE(review): the append of ``rec_fmt`` to
            # ``formatted_records`` is missing, so ``records`` is always
            # empty.

        data_dict = OrderedDict()
        data_dict['record_count'] = len(formatted_records)
        data_dict['total_count'] = running_total
        data_dict['records'] = formatted_records

        return StatsResult.build_success_result(data_dict, sql_query)
    def get_file_counts_per_dataset_latest_versions(self, **extra_filters):
        # NOTE(review): the next two lines are docstring text whose
        # triple-quote delimiters are missing; the block does not parse as
        # written.
        Get binning stats for the number of files in each Dataset.
        For the counts, only use the LATEST DatasetVersion

        # Get the correct DatasetVersion ids as a filter parameter
        latest_dsv_ids = self.get_dataset_version_ids(**extra_filters)
        filter_params = dict(datasetversion__id__in=latest_dsv_ids)

        # Make query
        # NOTE(review): this query is truncated -- the step that defines
        # 'dsv_id' and 'cnt' and the end of the chain are missing and cannot
        # be recovered from this file.
        ds_version_counts = FileMetadata.objects.filter(**filter_params\
                            ).values('dsv_id', 'cnt'\

        # Convert to Dataframe
        df = pd.DataFrame(list(ds_version_counts), columns = ['dsv_id', 'cnt'])

        # Get the list of bins
        # NOTE(review): ``high_num = high_num=...`` is a redundant chained
        # assignment, and ``self.bin_size`` is added both here and again in
        # the get_bin_list call -- confirm the double padding is intended.
        high_num = high_num=df['cnt'].max() + self.bin_size
        bins = self.get_bin_list(step=self.bin_size, low_num=0, high_num=high_num+self.bin_size)

        # Add a new column, assigning each file count to a bin
        df['bin_label'] = pd.cut(df['cnt'], bins)

        # Count the occurrence of each bin
        bin_count_series = pd.value_counts(df['bin_label'])

        # Make the Series into a new DataFrame
        # NOTE(review): this call is truncated. The later
        # ``query('count != 0')`` implies a 'count' column was also passed
        # here -- confirm.
        df_bins = pd.DataFrame(dict(bin=bin_count_series.index,\

        # Add a sort key
        # (0, 20] -> 0
        # (20, 30] -> 20
        # etc
        # The lambdas treat each bin as a string like "(0, 20]": strip the
        # brackets, split on the comma.
        df_bins['sort_key'] = df_bins['bin'].apply(lambda x: int(x[1:-1].split(',')[0]))
        df_bins['bin_start_inclusive'] = df_bins['sort_key']
        df_bins['bin_end'] = df_bins['bin'].apply(lambda x: int(x[1:-1].split(',')[1]))

        # Add a formatted string
        # (0, 20] -> 0 to 20
        # (20, 30] -> 20 to 30
        # etc
        df_bins['bin_str'] = df_bins['bin'].apply(lambda x: x[1:-1].replace(', ', ' to '))

        # Sort the bins
        # NOTE(review): ``DataFrame.sort`` exists only in old pandas
        # releases (later replaced by ``sort_values``) -- check the pinned
        # pandas version.
        df_bins = df_bins.sort('sort_key')


        # If appropriate, skip empty bins, e.g. remove 0 counts
        if self.skip_empty_bins:
            df_bins = df_bins.query('count != 0')

        # Return as python dict
        #   # bit expensive but want orderedDict
        # Round-trips through JSON so each record becomes an OrderedDict.
        formatted_records_json = df_bins.to_json(orient='records')
        formatted_records = json.loads(formatted_records_json, object_pairs_hook=OrderedDict)

        data_dict = OrderedDict()
        data_dict['record_count'] = len(formatted_records)
        data_dict['records'] = formatted_records

        return StatsResult.build_success_result(data_dict)
    def get_datafile_content_type_counts(self, **extra_filters):
        # NOTE(review): the lines from "Return datafile counts by 'content
        # type'" down to the last "percent_string" entry are docstring text
        # whose triple-quote delimiters (and the braces of the example
        # payload) are missing; the block does not parse as written.
        Return datafile counts by 'content type'

        "datafile_content_type_counts": [
                    "total_count": 1584,
                    "contenttype": "text/tab-separated-values",
                    "type_count": 187,
                    "percent_string": "11.8%"
                    "total_count": 1584,
                    "contenttype": "image/jpeg",
                    "type_count": 182,
                    "percent_string": "11.5%"
                    "total_count": 1584,
                    "contenttype": "text/plain",
                    "type_count": 147,
                    "percent_string": "9.3%"
        if self.was_error_found():
            return self.get_error_msg_return()

        # Retrieve the date parameters
        filter_params = self.get_date_filter_params(DVOBJECT_CREATEDATE_ATTR)

        # Add extra filters
        if extra_filters:
            for k, v in extra_filters.items():
                filter_params[k] = v

        # NOTE(review): this query is truncated after select_related(...).
        # The code below expects rows that are mappings with 'contenttype'
        # and 'type_count'; the filter/values/annotate tail cannot be
        # recovered from this file.
        datafile_counts_by_type = Datafile.objects.select_related('dvobject'\

        sql_query = str(datafile_counts_by_type.query)

        # Count all dataverses
        # (The rows summed here come from the Datafile query above.)
        total_count = sum([rec.get('type_count', 0) for rec in datafile_counts_by_type])
        # Force a float so the percentage division below is not truncated.
        total_count = total_count + 0.0

        # Format the records, adding 'total_count' and 'percent_string' to each one
        formatted_records = []
        #num = 0
        for rec in datafile_counts_by_type:

            if total_count > 0:
                fmt_dict = OrderedDict()
                fmt_dict['contenttype'] = rec['contenttype']

                # short contenttype
                # NOTE(review): an ``else:`` appears to be missing before the
                # second assignment; as written the part after the first '/'
                # is always overwritten by the full content type.
                contenttype_parts = rec['contenttype'].split('/')
                if len(contenttype_parts) > 1:
                    fmt_dict['short_content_type'] = '/'.join(contenttype_parts[1:])
                    fmt_dict['short_content_type'] = rec['contenttype']

                fmt_dict['type_count'] = rec.get('type_count', 0)

                float_percent = fmt_dict['type_count'] / total_count
                fmt_dict['total_count'] = int(total_count)
                fmt_dict['percent_string'] = '{0:.1%}'.format(float_percent)

                # NOTE(review): ``fmt_dict`` is never appended to
                # ``formatted_records``; the append line appears to be
                # missing.

        data_dict = OrderedDict()
        data_dict['record_count'] = len(formatted_records)
        data_dict['records'] = formatted_records

        return StatsResult.build_success_result(data_dict, sql_query)
    def get_datafile_content_type_counts(self, **extra_filters):
        # NOTE(review): the lines from "Return datafile counts by 'content
        # type'" down to the last "percent_string" entry are docstring text
        # whose triple-quote delimiters (and the braces of the example
        # payload) are missing; the block does not parse as written.
        Return datafile counts by 'content type'

        "datafile_content_type_counts": [
                    "total_count": 1584,
                    "contenttype": "text/tab-separated-values",
                    "type_count": 187,
                    "percent_string": "11.8%"
                    "total_count": 1584,
                    "contenttype": "image/jpeg",
                    "type_count": 182,
                    "percent_string": "11.5%"
                    "total_count": 1584,
                    "contenttype": "text/plain",
                    "type_count": 147,
                    "percent_string": "9.3%"
        if self.was_error_found():
            return self.get_error_msg_return()

        # Retrieve the date parameters
        filter_params = self.get_date_filter_params(DVOBJECT_CREATEDATE_ATTR)

        # Add extra filters
        if extra_filters:
            for k, v in extra_filters.items():
                filter_params[k] = v

        # NOTE(review): this query is truncated after select_related(...).
        # Rows are expected to be mappings with 'contenttype' and
        # 'type_count'; the rest of the chain cannot be recovered from this
        # file.
        datafile_counts_by_type = Datafile.objects.select_related('dvobject'\

        sql_query = str(datafile_counts_by_type.query)

        # Count all dataverses
        # (The rows summed here come from the Datafile query above.)
        total_count = sum(
            [rec.get('type_count', 0) for rec in datafile_counts_by_type])
        # Force a float so the percentage division below is not truncated.
        total_count = total_count + 0.0

        # Format the records, adding 'total_count' and 'percent_string' to each one
        formatted_records = []
        #num = 0
        for rec in datafile_counts_by_type:

            if total_count > 0:
                fmt_dict = OrderedDict()
                fmt_dict['contenttype'] = rec['contenttype']

                # short contenttype
                # NOTE(review): the ``'/'.join(`` call below is truncated
                # (its argument and closing parenthesis are missing), and an
                # ``else:`` appears to be missing before the second
                # assignment.
                contenttype_parts = rec['contenttype'].split('/')
                if len(contenttype_parts) > 1:
                    fmt_dict['short_content_type'] = '/'.join(
                    fmt_dict['short_content_type'] = rec['contenttype']

                fmt_dict['type_count'] = rec.get('type_count', 0)

                float_percent = fmt_dict['type_count'] / total_count
                fmt_dict['total_count'] = int(total_count)
                fmt_dict['percent_string'] = '{0:.1%}'.format(float_percent)

                # NOTE(review): ``fmt_dict`` is never appended to
                # ``formatted_records``; the append line appears to be
                # missing.

        data_dict = OrderedDict()
        data_dict['record_count'] = len(formatted_records)
        data_dict['records'] = formatted_records

        return StatsResult.build_success_result(data_dict, sql_query)
    # Monthly file-download counts taken from GuestBookResponse, with a
    # running total carried across months.
    #
    # Result payload: 'total_downloads', 'record_count' and 'records'; each
    # record is built with 'yyyy_mm', 'count', 'running_total', 'year_num',
    # 'month_num' and (when a month name is found) 'month_name' /
    # 'month_name_short'.
    #
    # NOTE(review): a second `def get_file_downloads_by_month` follows this
    # one at the same indentation. If both live in the same class body the
    # later definition replaces this one, making this copy dead code.
    # NOTE(review): several calls below are left open and `else:` lines
    # appear to be missing, so this copy does not parse as shown.
    def get_file_downloads_by_month(self, **extra_filters):
        Using the GuestBookResponse object, find the number of file
        downloads per month
        # Short-circuit with the stored error result if an error was flagged
        if self.was_error_found():
            return self.get_error_msg_return()

        # NOTE(review): the argument list of this call is not visible.
        filter_params = self.get_date_filter_params(


        # Narrow down to specific Dataverses
        # NOTE(review): no narrowing step is visible between the comment above
        # and this second error check; a statement seems to have been lost.
        if self.was_error_found():
            return self.get_error_msg_return()

        # Add extra filters, if they exist
        count_pre_dv4_downloads = False
        if extra_filters:
            # NOTE(review): as written the flag key is deleted and then still
            # copied into filter_params; an `else:` before the last line looks
            # missing. Deleting from the dict while looping over .items() only
            # works where .items() returns a list copy (Python 2).
            for k, v in extra_filters.items():
                if k == INCLUDE_PRE_DV4_DOWNLOADS:  # skip this param
                    count_pre_dv4_downloads = True
                    del extra_filters[k]
                    filter_params[k] = v

        # NOTE(review): the exclude() arguments and whatever produces the
        # 'yyyy_mm' / 'count' columns are not visible in this chain.
        # self.time_sort is prepended to the sort field name -- presumably ''
        # or '-' to pick ascending/descending; confirm where it is set.
        file_counts_by_month = GuestBookResponse.objects.exclude(\
            ).values('yyyy_mm', 'count'\
            ).order_by('%syyyy_mm' % self.time_sort)

        #print 'file_counts_by_month.query', file_counts_by_month.query
        sql_query = str(file_counts_by_month.query)

        formatted_records = []  # move from a queryset to a []

        # Starting value for the running total.
        # NOTE(review): both calls are left open and the second assignment
        # would overwrite the first; an `else:` appears to be missing.
        if count_pre_dv4_downloads:
            file_running_total = self.get_file_download_start_point_include_undated(
            file_running_total = self.get_file_download_start_point(

        for d in file_counts_by_month:
            fmt_rec = OrderedDict()
            # d['yyyy_mm'] is used as a date-like object (strftime/year/month)
            fmt_rec['yyyy_mm'] = d['yyyy_mm'].strftime('%Y-%m')
            fmt_rec['count'] = d['count']

            file_running_total += d['count']
            fmt_rec['running_total'] = file_running_total

            # d['month_year'] = d['yyyy_mm'].strftime('%Y-%m')

            # Add year and month numbers
            fmt_rec['year_num'] = d['yyyy_mm'].year
            fmt_rec['month_num'] = d['yyyy_mm'].month

            # Add month name
            # NOTE(review): the call below is left open; the trailing
            # "Log it" comment presumably belonged to a lost `else:` branch.
            month_name_found, month_name_short = get_month_name_abbreviation(
            if month_name_found:
                assume_month_name_found, fmt_rec[
                    'month_name'] = get_month_name(d['yyyy_mm'].month)
                fmt_rec['month_name_short'] = month_name_short
                # Log it!!!!!!

            # NOTE(review): nothing visible appends fmt_rec to
            # formatted_records.


        data_dict = OrderedDict()
        data_dict['total_downloads'] = file_running_total
        data_dict['record_count'] = len(formatted_records)
        data_dict['records'] = formatted_records

        return StatsResult.build_success_result(data_dict, sql_query)
    # Monthly file-download counts taken from GuestBookResponse, with a
    # running total carried across months.
    #
    # Keyword filters are merged into the date filters. The special key
    # INCLUDE_PRE_DV4_DOWNLOADS is not meant to be a query filter (see the
    # "skip this param" comment); it switches the running-total start point
    # to the "include undated" variant.
    #
    # Result payload: 'total_downloads', 'record_count' and 'records'; each
    # record is built with 'yyyy_mm', 'count', 'running_total', 'year_num',
    # 'month_num' and (when a month name is found) 'month_name' /
    # 'month_name_short'.
    #
    # NOTE(review): an earlier `def get_file_downloads_by_month` precedes this
    # one at the same indentation; if both are in the same class body, this
    # later definition is the one that takes effect.
    # NOTE(review): the docstring quotes, some `else:` lines, part of the
    # query chain and the append to `formatted_records` appear to be missing.
    def get_file_downloads_by_month(self, **extra_filters):
        Using the GuestBookResponse object, find the number of file
        downloads per month
        # Short-circuit with the stored error result if an error was flagged
        if self.was_error_found():
            return self.get_error_msg_return()

        # Date filters are built against the 'responsetime' field
        filter_params = self.get_date_filter_params(date_var_name='responsetime')


        # Narrow down to specific Dataverses
        # NOTE(review): no narrowing step is visible between the comment above
        # and this second error check; a statement seems to have been lost.
        if self.was_error_found():
            return self.get_error_msg_return()

        # Add extra filters, if they exist
        count_pre_dv4_downloads = False
        if extra_filters:
            # NOTE(review): as written the flag key is deleted and then still
            # copied into filter_params; an `else:` before the last line looks
            # missing. Deleting from the dict while looping over .items() only
            # works where .items() returns a list copy (Python 2); on Python 3
            # it raises RuntimeError.
            for k, v in extra_filters.items():
                if k == INCLUDE_PRE_DV4_DOWNLOADS:    # skip this param
                    count_pre_dv4_downloads = True
                    del extra_filters[k]
                    filter_params[k] = v

        # NOTE(review): the exclude() arguments and whatever produces the
        # 'yyyy_mm' / 'count' columns are not visible in this chain.
        # self.time_sort is prepended to the sort field name -- presumably ''
        # or '-' to pick ascending/descending; confirm where it is set.
        file_counts_by_month = GuestBookResponse.objects.exclude(\
            ).values('yyyy_mm', 'count'\
            ).order_by('%syyyy_mm' % self.time_sort)

        #print 'file_counts_by_month.query', file_counts_by_month.query
        sql_query = str(file_counts_by_month.query)

        formatted_records = []  # move from a queryset to a []

        # Starting value for the running total.
        # NOTE(review): as written the second assignment always overwrites
        # the first; an `else:` appears to be missing between them.
        if count_pre_dv4_downloads:
            file_running_total = self.get_file_download_start_point_include_undated(**extra_filters)
            file_running_total = self.get_file_download_start_point(**extra_filters)

        for d in file_counts_by_month:
            fmt_rec = OrderedDict()
            # d['yyyy_mm'] is used as a date-like object (strftime/year/month)
            fmt_rec['yyyy_mm'] = d['yyyy_mm'].strftime('%Y-%m')
            fmt_rec['count'] = d['count']

            file_running_total += d['count']
            fmt_rec['running_total'] = file_running_total

            # d['month_year'] = d['yyyy_mm'].strftime('%Y-%m')

            # Add year and month numbers
            fmt_rec['year_num'] = d['yyyy_mm'].year
            fmt_rec['month_num'] = d['yyyy_mm'].month

            # Add month name
            # Both helpers return a (found, name) pair; the flag from the
            # second call is deliberately ignored.
            # NOTE(review): the trailing "Log it" comment presumably belonged
            # to a lost `else:` branch.
            month_name_found, month_name_short = get_month_name_abbreviation( d['yyyy_mm'].month)
            if month_name_found:
                assume_month_name_found, fmt_rec['month_name'] = get_month_name(d['yyyy_mm'].month)
                fmt_rec['month_name_short'] = month_name_short
                # Log it!!!!!!

            # NOTE(review): nothing visible appends fmt_rec to
            # formatted_records.


        data_dict = OrderedDict()
        data_dict['total_downloads'] = file_running_total
        data_dict['record_count'] = len(formatted_records)
        data_dict['records'] = formatted_records

        return StatsResult.build_success_result(data_dict, sql_query)
    # Monthly Datafile counts and byte totals, bucketed by `date_param`
    # (default: DVOBJECT_CREATEDATE_ATTR), with a running file count.
    #
    # Result payload: 'record_count', 'records', 'total_bytes' and
    # 'total_bytes_str'; each record is built with 'yyyy_mm', 'count',
    # 'bytes', 'bytes_str', 'running_total', 'year_num', 'month_num' and
    # (when a month name is found) 'month_name' / 'month_name_short'.
    #
    # NOTE(review): the docstring quotes, an `else:`, part of the query chain
    # and the append to `formatted_records` appear to be missing from the
    # visible text.
    def get_file_count_by_month(self, date_param=DVOBJECT_CREATEDATE_ATTR, **extra_filters):
        File counts by month
        # Was an error found earlier?
        if self.was_error_found():
            return self.get_error_msg_return()

        # -----------------------------------
        # (1) Build query filters
        # -----------------------------------

        # Exclude records where dates are null
        #   - e.g. a record may not have a publication date
        # NOTE(review): as written the second assignment always overwrites
        # the first; an `else:` appears to be missing between them.
        if date_param == DVOBJECT_CREATEDATE_ATTR:
            exclude_params = {}
            exclude_params = { '%s__isnull' % date_param : True}

        # Retrieve the date parameters
        filter_params = self.get_date_filter_params()

        # Add extra filters from kwargs
        # (caller-supplied kwargs overwrite date filters with the same key)
        if extra_filters:
            for k, v in extra_filters.items():
                filter_params[k] = v

        # -----------------------------------
        # (2) Construct query
        # -----------------------------------

        # add exclude filters date filters
        # NOTE(review): the rest of this chain (presumably the exclude/filter
        # calls using the params built above) is not visible.
        file_counts_by_month = Datafile.objects.select_related('dvobject'\

        # annotate query adding "month_year" and "cnt"
        # NOTE(review): only the yyyy_mm annotation is visible; whatever
        # produces the 'count' and 'bytes' columns is not shown.
        # self.time_sort is prepended to the sort field name -- presumably ''
        # or '-' to pick ascending/descending; confirm where it is set.
        file_counts_by_month = file_counts_by_month.annotate(\
            yyyy_mm=TruncYearMonth('%s' % date_param)\
            ).values('yyyy_mm', 'count', 'bytes'\
            ).order_by('%syyyy_mm' % self.time_sort)

        sql_query = str(file_counts_by_month.query)

        # -----------------------------------
        # (3) Format results
        # -----------------------------------
        running_total = self.get_file_count_start_point(**extra_filters)   # hold the running total count
        # total_bytes sums only the months returned by this query
        total_bytes = 0
        formatted_records = []  # move from a queryset to a []

        for d in file_counts_by_month:
            fmt_rec = OrderedDict()

            # d['yyyy_mm'] is used as a date-like object (strftime/year/month)
            fmt_rec['yyyy_mm'] = d['yyyy_mm'].strftime('%Y-%m')
            fmt_rec['count'] = d['count']
            fmt_rec['bytes'] = d['bytes']
            fmt_rec['bytes_str'] = comma_sep_number(d['bytes'])

            # NOTE(review): assumes d['bytes'] is never None; a NULL sum from
            # the database would make this addition fail -- confirm.
            total_bytes += d['bytes']
            # running total
            running_total += d['count']
            fmt_rec['running_total'] = running_total

            # d['month_year'] = d['yyyy_mm'].strftime('%Y-%m')

            # Add year and month numbers
            fmt_rec['year_num'] = d['yyyy_mm'].year
            fmt_rec['month_num'] = d['yyyy_mm'].month

            # Add month name
            # Both helpers return a (found, name) pair; the flag from the
            # second call is deliberately ignored.
            # NOTE(review): the trailing "Log it" comment presumably belonged
            # to a lost `else:` branch.
            month_name_found, month_name_short = get_month_name_abbreviation(d['yyyy_mm'].month)
            if month_name_found:
                assume_month_name_found, fmt_rec['month_name'] = get_month_name(d['yyyy_mm'].month)
                fmt_rec['month_name_short'] = month_name_short
                # Log it!!!!!!

            # Add formatted record
            # NOTE(review): the append announced above is not visible.

        data_dict = OrderedDict()
        data_dict['record_count'] = len(formatted_records)
        data_dict['records'] = formatted_records
        data_dict['total_bytes'] = total_bytes
        data_dict['total_bytes_str'] = comma_sep_number(total_bytes)

        return StatsResult.build_success_result(data_dict, sql_query)
    # Return Dataverse counts grouped by affiliation, each with its share of
    # the overall total formatted as a percentage string.
    #
    # Result payload: 'record_count' and 'records'; each record is built with
    # 'affiliation', 'affiliation_count', 'total_count' and 'percent_string'.
    # Rows whose 'affiliation_count' is 0 are skipped.
    #
    # NOTE(review): the docstring quotes, the end of the query chain, the
    # closing of the sum([...]) call, an `else:` and the append to
    # `formatted_records` appear to be missing from the visible text.
    def get_dataverse_affiliation_counts(self, **extra_filters):
        Return Dataverse counts by affiliation

        Returns: dv_counts_by_affiliation": [
                "affiliation": "University of Oxford",
                "affiliation_count": 2,
                "total_count": 191,
                "percent_string": "1.0%"
                "affiliation": "University of Illinois",
                "affiliation_count": 1,
                "total_count": 191,
                "percent_string": "0.5%"
        # Short-circuit with the stored error result if an error was flagged
        if self.was_error_found():
            return self.get_error_msg_return()

        # Retrieve the date parameters
        filter_params = self.get_date_filter_params(DVOBJECT_CREATEDATE_ATTR)

        # Caller-supplied kwargs overwrite date filters with the same key
        if extra_filters:
            for k, v in extra_filters.items():
                filter_params[k] = v

        # NOTE(review): the remainder of this query chain is not visible. The
        # code below expects dict-like rows exposing 'affiliation' and
        # 'affiliation_count' -- confirm against the original query.
        dataverse_counts_by_affil = Dataverse.objects.select_related('dvobject'\

        # -----------------------------------
        # Get SQL query string
        # -----------------------------------
        sql_query = str(dataverse_counts_by_affil.query)

        # Count all dataverses
        # Adding 0.0 makes the total a float, presumably so the percentage
        # division below is not Python 2 integer division.
        # NOTE(review): the sum([ ... call is never closed in the visible text.
        total_count = sum([
            rec.get('affiliation_count', 0)
            for rec in dataverse_counts_by_affil
        total_count = total_count + 0.0

        # NOTE(review): Python 2 print statement left in as debug output; it
        # writes to stdout on every call and is a syntax error on Python 3.
        print 'dataverse_counts_by_affil', dataverse_counts_by_affil

        # Format the records, adding 'total_count' and 'percent_string' to each one
        formatted_records = []
        for rec in dataverse_counts_by_affil:
            if rec.get('affiliation_count', 0) > 0:
                fmt_dict = OrderedDict()
                # The affiliation is stored UTF-8 encoded; a missing
                # affiliation stays None
                affil_str = rec.get('affiliation', None)
                if affil_str is not None:
                    affil_str = affil_str.encode('utf-8')
                fmt_dict['affiliation'] = affil_str

                fmt_dict['affiliation_count'] = rec.get('affiliation_count', 0)

                # NOTE(review): the format( call below is left open and is
                # followed by unconditional zero assignments; its argument and
                # an `else:` appear to be missing.
                if total_count > 0:
                    float_percent = rec.get('affiliation_count',
                                            0) / total_count
                    fmt_dict['total_count'] = int(total_count)
                    fmt_dict['percent_string'] = '{0:.1%}'.format(
                    fmt_dict['total_count'] = 0
                    fmt_dict['percent_string'] = '0%'

                # NOTE(review): nothing visible appends fmt_dict to
                # formatted_records.


        data_dict = OrderedDict()
        data_dict['record_count'] = len(formatted_records)
        data_dict['records'] = formatted_records

        return StatsResult.build_success_result(data_dict, sql_query)
    # Count Datasets per subject using the controlled-vocabulary values
    # attached to the latest DatasetVersion of each Dataset.
    #
    # Result payload: 'record_count' and 'records'; each record is built with
    # 'subject', 'count', 'total_count', 'percent_string' and
    # 'percent_number'. Unlike its siblings, the success result is built
    # without an SQL string.
    #
    # NOTE(review): a `try:`, several query continuations, the body of an
    # `if` and the append to `formatted_records` appear to be missing from
    # the visible text.
    def get_dataset_subject_counts(self, **extra_filters):
        """Dataset counts by subject"""

        # Was an error found earlier?
        if self.was_error_found():
            return self.get_error_msg_return()

        # -----------------------------------
        # (1) Build query filters
        # -----------------------------------

        # Retrieve the date parameters
        # -----------------------------------
        filter_params = self.get_date_filter_params()

        # -----------------------------------
        # Add extra filters from kwargs
        # -----------------------------------
        if extra_filters:
            for k, v in extra_filters.items():
                filter_params[k] = v

        # -----------------------------
        # Get the DatasetFieldType for subject
        # -----------------------------
        # NOTE(review): the end of the dict(...) call and the `try:` that the
        # `except` below pairs with are not visible.
        # NOTE(review): this error path returns a bare (False, message) tuple
        # while every other exit goes through StatsResult or
        # get_error_msg_return(); callers may not expect a tuple. The message
        # also says "Citation title" although the lookup is name='subject'.
        search_attrs = dict(name='subject',\
            ds_field_type = DatasetFieldType.objects.get(**search_attrs)
        except DatasetFieldType.DoesNotExist:
            return False, 'DatasetFieldType for Citation title not found.  (kwargs: %s)' % search_attrs

        # -----------------------------
        # Retrieve Dataset ids by time and published/unpublished
        # -----------------------------
        # NOTE(review): the filter step between select_related and
        # values_list is not visible.
        dataset_ids = Dataset.objects.select_related('dvobject'\
                        ).values_list('dvobject__id', flat=True)

        # -----------------------------
        # Get latest DatasetVersion ids
        # -----------------------------
        # Rows are ordered so that, within each dataset_id, the first row has
        # the highest id. The filter arguments are not visible.
        id_info_list = DatasetVersion.objects.filter(\
            ).values('id', 'dataset_id', 'versionnumber', 'minorversionnumber'\
            ).order_by('dataset_id', '-id', '-versionnumber', '-minorversionnumber')

        # -----------------------------
        # Iterate through and get the DatasetVersion id
        #        of the latest version
        # -----------------------------
        dsv_ids = []
        last_dataset_id = None
        for idx, info in enumerate(id_info_list):
            # First row seen for a dataset_id
            # NOTE(review): the body of this `if` is missing; presumably it
            # appended info['id'] to dsv_ids -- confirm.
            if idx == 0 or info['dataset_id'] != last_dataset_id:

            last_dataset_id = info['dataset_id']

        # -----------------------------
        # Get the DatasetField ids
        # -----------------------------
        # NOTE(review): both statements below are cut off mid-call.
        search_attrs2 = dict(datasetversion__id__in=dsv_ids,\
        ds_field_ids = DatasetField.objects.select_related(

        # -----------------------------
        # Finally, get the ControlledVocabularyValues
        # -----------------------------
        # NOTE(review): the filter/annotate steps that produce the 'subject'
        # and 'cnt' columns, and the end of this chain, are not visible.
        ds_values = DatasetFieldControlledVocabularyValue.objects.select_related('controlledvocabularyvalues'\
            ).values('subject', 'cnt'\

        # -----------------------------
        # Iterate through the vocab values,
        # process the totals, calculate percentage
        # -----------------------------
        # NOTE(review): running_total is assigned but not used in the visible
        # code.
        running_total = 0
        formatted_records = []  # move from a queryset to a []
        # Float total, presumably to avoid Python 2 integer division below
        total_count = sum([rec['cnt'] for rec in ds_values]) + 0.00

        for info in ds_values:
            rec = OrderedDict()
            rec['subject'] = info['subject']

            # count
            rec['count'] = info['cnt']
            rec['total_count'] = int(total_count)

            # percent
            # NOTE(review): no guard for total_count == 0 here, unlike the
            # sibling percentage methods.
            float_percent = info['cnt'] / total_count
            rec['percent_string'] = '{0:.1%}'.format(float_percent)
            # Same ratio rounded to three decimals, as a float
            rec['percent_number'] = float("%.3f" % (float_percent))

            # total count
            # NOTE(review): nothing visible appends rec to formatted_records.


        data_dict = OrderedDict()
        data_dict['record_count'] = len(formatted_records)
        data_dict['records'] = formatted_records

        return StatsResult.build_success_result(data_dict)
    # Monthly Dataset counts bucketed by `date_param`, with a running total.
    #
    # Result payload: 'record_count' and 'records'; each record is built with
    # 'yyyy_mm', 'count', 'running_total', 'year_num', 'month_num' and (when a
    # month name is found) 'month_name' / 'month_name_short'.
    #
    # NOTE(review): the signature is cut off after `self,` (the body uses
    # `date_param` and `extra_filters`, so those parameters presumably follow).
    # The docstring quotes, an `else:`, part of the query chain and the append
    # to `formatted_records` also appear to be missing.
    def get_dataset_count_by_month(self,
        Return dataset counts by month
        # Was an error found earlier?
        if self.was_error_found():
            return self.get_error_msg_return()

        # -----------------------------------
        # (1) Build query filters
        # -----------------------------------

        # Exclude records where dates are null
        #   - e.g. a record may not have a publication date
        # NOTE(review): as written the second assignment always overwrites
        # the first; an `else:` appears to be missing between them.
        if date_param == DVOBJECT_CREATEDATE_ATTR:
            exclude_params = {}
            exclude_params = {'%s__isnull' % date_param: True}

        # Retrieve the date parameters
        filter_params = self.get_date_filter_params()

        # Add extra filters from kwargs
        # (caller-supplied kwargs overwrite date filters with the same key)
        if extra_filters:
            for k, v in extra_filters.items():
                filter_params[k] = v

        # -----------------------------------
        # (2) Construct query
        # -----------------------------------

        # add exclude filters date filters
        # NOTE(review): the rest of this chain (presumably the exclude/filter
        # calls using the params built above) is not visible.
        ds_counts_by_month = Dataset.objects.select_related('dvobject'\

        # annotate query adding "month_year" and "cnt"
        # NOTE(review): only the yyyy_mm annotation is visible; whatever
        # produces the 'count' column is not shown.
        # self.time_sort is prepended to the sort field name -- presumably ''
        # or '-' to pick ascending/descending; confirm where it is set.
        ds_counts_by_month = ds_counts_by_month.annotate(\
            yyyy_mm=TruncYearMonth('%s' % date_param)\
            ).values('yyyy_mm', 'count'\
            ).order_by('%syyyy_mm' % self.time_sort)

        # store query string
        sql_query = str(ds_counts_by_month.query)

        # -----------------------------------
        # (3) Format results
        # -----------------------------------
        # hold the running total count
        running_total = self.get_dataset_count_start_point(**extra_filters)
        formatted_records = []  # move from a queryset to a []

        for d in ds_counts_by_month:
            fmt_dict = OrderedDict()
            # d['yyyy_mm'] is used as a date-like object (strftime/year/month)
            fmt_dict['yyyy_mm'] = d['yyyy_mm'].strftime('%Y-%m')
            fmt_dict['count'] = d['count']

            # running total
            running_total += d['count']
            fmt_dict['running_total'] = running_total
            # d['month_year'] = d['yyyy_mm'].strftime('%Y-%m')

            # Add year and month numbers
            fmt_dict['year_num'] = d['yyyy_mm'].year
            fmt_dict['month_num'] = d['yyyy_mm'].month

            # Add month name
            # NOTE(review): the call below is left open; the trailing
            # "Log it" comment presumably belonged to a lost `else:` branch.
            month_name_found, month_name_short = get_month_name_abbreviation(

            if month_name_found:
                assume_month_name_found, fmt_dict[
                    'month_name'] = get_month_name(d['yyyy_mm'].month)
                fmt_dict['month_name_short'] = month_name_short
                # Log it!!!!!!

            # Add formatted record
            # NOTE(review): the append announced above is not visible.

        data_dict = OrderedDict()
        data_dict['record_count'] = len(formatted_records)
        data_dict['records'] = formatted_records

        return StatsResult.build_success_result(data_dict, sql_query)
    # Result payload: 'count' and 'records'; each record is built with 'id',
    # 'name', 'alias', 'url', 'affiliation', 'publication_date' and
    # 'create_date'.
    #
    # NOTE(review): `extra_filters` is accepted but not used in the visible
    # code, and part of the main query, the strftime arguments and the append
    # to `records` appear to be missing.
    def get_published_dataverses_without_content(self, **extra_filters):
        """For curation purposes: a list of all published dataverses that do
        not contain any datasets/content. A spreadsheet starting with the oldest
        dataverses is appreciated.  Based on @sekmiller's SQL query"""

        # Was an error found earlier?
        if self.was_error_found():
            return self.get_error_msg_return()

        # -----------------------------------
        # Retrieve the date parameters - distinguish by create date
        # -----------------------------------
        # NOTE(review): filter_params is not used anywhere in the visible
        # code; presumably it fed the truncated query below.
        filter_params = self.get_date_filter_params()

        # -----------------------------------
        # Retrieve ids of Dataverses to ~exclude~
        # -----------------------------------

        # Get DvObject Ids of:
        #  - Dataverses that contain Datasets
        #  - Dataverses that have an owner
        # The values collected are the *owner* ids of the matching DvObjects,
        # i.e. the parents of datasets and of owned dataverses.
        id_set1 = DvObject.objects.filter(\
                        Q(dtype=DTYPE_DATASET) |\
                        Q(dtype=DTYPE_DATAVERSE, owner__isnull=False)
                        ).values_list('owner__id', flat=True)

        # Get DvObject Ids of:
        #  - Dataverses that link to datasets
        # NOTE(review): distinct() with a field name is PostgreSQL-only in
        # Django -- confirm the backend.
        id_set2 = DatasetLinkingDataverse.objects.distinct('linkingdataverse__id'\
                    ).values_list('linkingdataverse__id', flat=True)

        # Get DvObject Ids of:
        #  - Dataverses that link to Dataverses
        id_set3 = DataverseLinkingDataverse.objects.distinct('dataverse__id'\
                    ).values_list('dataverse__id', flat=True)

        #  Combine the ids into a list
        #  (the set removes ids that appear in more than one source)
        dv_ids_to_exclude = set(
            list(id_set1) + list(id_set2) + list((id_set3)))

        #   Retrieve published Dataverses that aren't in the list above
        # NOTE(review): the exclude/filter/annotate steps that use
        # dv_ids_to_exclude and create the 'dv_id', 'create_date' and
        # 'pub_date' columns are not visible in this chain.
        dv_info_list = Dataverse.objects.select_related('dvobject'\
                    ).values('dv_id', 'name', 'alias'\
                            , 'create_date', 'pub_date'\
                            , 'affiliation'\
                    ).order_by('create_date', 'name')

        # NOTE(review): `q` is not defined anywhere in the visible method, so
        # this line would raise NameError as written; it presumably should
        # read dv_info_list.query -- confirm.
        sql_query = str(q.query)

        records = []
        for dv_info in dv_info_list:
            single_rec = OrderedDict()
            single_rec['id'] = dv_info['dv_id']
            single_rec['name'] = dv_info['name']
            single_rec['alias'] = dv_info['alias']
            # Public URL built from the installation base URL and the alias
            single_rec['url'] = '%s/dataverse/%s' % (
                settings.DATAVERSE_INSTALLATION_URL, dv_info['alias'])
            #single_rec['description'] = dv_info['description']
            single_rec['affiliation'] = dv_info['affiliation']
            # NOTE(review): both strftime( calls are left open (format
            # argument missing), and nothing visible appends single_rec to
            # records.
            single_rec['publication_date'] = dv_info['pub_date'].strftime(
            single_rec['create_date'] = dv_info['create_date'].strftime(

        data_dict = OrderedDict()
        data_dict['count'] = len(records)
        data_dict['records'] = records

        return StatsResult.build_success_result(data_dict, sql_query)
    # Histogram of "number of files per Dataset", counting files only for the
    # DatasetVersion ids returned by self.get_dataset_version_ids().
    #
    # Bins are self.bin_size wide; empty bins are dropped when
    # self.skip_empty_bins is set. Result payload: 'record_count' and
    # 'records' (the binned DataFrame round-tripped through JSON). No SQL
    # string is attached to the result.
    #
    # NOTE(review): the docstring quotes, the tail of the query, the
    # DataFrame construction and the json.loads call are cut off in the
    # visible text.
    def get_file_counts_per_dataset_latest_versions(self, **extra_filters):
        Get binning stats for the number of files in each Dataset.
        For the counts, only use the LATEST DatasetVersion

        # Get the correct DatasetVersion ids as a filter parameter
        latest_dsv_ids = self.get_dataset_version_ids(**extra_filters)
        filter_params = dict(datasetversion__id__in=latest_dsv_ids)

        # Make query
        # NOTE(review): the annotate step that produces 'dsv_id' and 'cnt',
        # and the end of this chain, are not visible.
        ds_version_counts = FileMetadata.objects.filter(**filter_params\
                            ).values('dsv_id', 'cnt'\

        # Convert to Dataframe
        df = pd.DataFrame(list(ds_version_counts), columns=['dsv_id', 'cnt'])

        # Get the list of bins
        # NOTE(review): `high_num = high_num = ...` is a redundant chained
        # assignment, and bin_size is added here and again in the call below,
        # so the upper bound ends up two bin widths above the max count.
        high_num = high_num = df['cnt'].max() + self.bin_size
        bins = self.get_bin_list(step=self.bin_size,
                                 high_num=high_num + self.bin_size)

        # Add a new column, assigning each file count to a bin
        df['bin_label'] = pd.cut(df['cnt'], bins)

        # Count the occurrence of each bin
        bin_count_series = pd.value_counts(df['bin_label'])

        # Make the Series into a new DataFrame
        # NOTE(review): the rest of this dict(...) is not visible; later code
        # reads a 'count' column, so presumably count=bin_count_series.values.
        df_bins = pd.DataFrame(dict(bin=bin_count_series.index,\

        # Add a sort key
        # (0, 20] -> 0
        # (20, 30] -> 20
        # etc
        # The lambdas slice the bin label as text ("(0, 20]"), dropping the
        # bracket characters and splitting on the comma.
        # NOTE(review): this assumes string labels; pandas versions that
        # return Interval objects from pd.cut would break it -- confirm.
        df_bins['sort_key'] = df_bins['bin'].apply(
            lambda x: int(x[1:-1].split(',')[0]))
        df_bins['bin_start_inclusive'] = df_bins['sort_key']
        df_bins['bin_end'] = df_bins['bin'].apply(
            lambda x: int(x[1:-1].split(',')[1]))

        # Add a formatted string
        # (0, 20] -> 0 to 20
        # (20, 30] -> 20 to 30
        # etc
        df_bins['bin_str'] = df_bins['bin'].apply(
            lambda x: x[1:-1].replace(', ', ' to '))

        # Sort the bins
        # NOTE(review): DataFrame.sort() was removed from later pandas
        # releases in favour of sort_values() -- confirm the pinned version.
        df_bins = df_bins.sort('sort_key')


        # If appropriate, skip empty bins, e.g. remove 0 counts
        if self.skip_empty_bins:
            df_bins = df_bins.query('count != 0')

        # Return as python dict
        #   # bit expensive but want orderedDict
        # NOTE(review): the json.loads( call is left open; given the comment
        # above it presumably passed object_pairs_hook=OrderedDict.
        formatted_records_json = df_bins.to_json(orient='records')
        formatted_records = json.loads(formatted_records_json,

        data_dict = OrderedDict()
        data_dict['record_count'] = len(formatted_records)
        data_dict['records'] = formatted_records

        return StatsResult.build_success_result(data_dict)
    # Histogram of per-Dataset byte size (sum of Datafile 'filesize'), binned
    # in steps of self.bin_size_bytes.
    #
    # Result payload: 'record_count', 'dataset_count', 'total_bytes_used',
    # 'total_bytes_used_comma', 'total_bytes_used_abbrev' and 'records' (the
    # binned DataFrame round-tripped through JSON into OrderedDicts). No SQL
    # string is attached to the result.
    #
    # NOTE(review): the docstring quotes, the body of `if extra_filters:`,
    # parts of the query chain and the DataFrame construction are cut off in
    # the visible text.
    def get_dataset_size_counts(self, **extra_filters):
        Get binning stats for the byte size of each Dataset.

        # Get the correct DatasetVersion ids as a filter parameter
        filter_params = {}
        # NOTE(review): this `if` has no visible body; presumably it copied
        # extra_filters into filter_params -- confirm.
        if extra_filters:
        # Make query
        # NOTE(review): the grouping/annotate steps that produce 'ds_id' and
        # 'cnt', and the end of this chain, are not visible.
        dataset_file_sizes = Datafile.objects.filter(**filter_params\
                                , ds_size=Sum('filesize')
                            ).values('ds_id', 'cnt', 'ds_size'\

        #total_bytes_used_result = Datafile.objects.filter(**filter_params\
        #                    ).aggregate(ds_size=Sum('filesize'))

        # Convert to Dataframe
        df = pd.DataFrame(list(dataset_file_sizes), columns = ['ds_id', 'cnt', 'ds_size'])

        #total_dataset_count = len(df.index) # * includes rows with missing values
        total_bytes_used = df['ds_size'].sum()

        # Get the list of bins
        # NOTE(review): bin_size_bytes is added here and again in the call
        # below, so the upper bound ends up two bin widths above the largest
        # dataset size.
        high_num = df['ds_size'].max() + self.bin_size_bytes

        bins = self.get_bin_list(step=self.bin_size_bytes, low_num=0, high_num=high_num+self.bin_size_bytes)

        # Add a new column, assigning each file count to a bin
        df['bin_label'] = pd.cut(df['ds_size'], bins)

        # Count the occurrence of each bin
        bin_count_series = pd.value_counts(df['bin_label'])

        # Make the Series into a new DataFrame
        # NOTE(review): the rest of this dict(...) is not visible; later code
        # reads a 'count' column, so presumably count=bin_count_series.values.
        df_bins = pd.DataFrame(dict(bin=bin_count_series.index,\

        # Dataset total is taken from the bin counts rather than len(df); see
        # the commented-out line above about rows with missing values
        total_dataset_count = df_bins['count'].sum()

        # Add a sort key
        # (0, 20] -> 0
        # (20, 30] -> 20
        # etc
        # NOTE(review): the lambdas slice the bin label as text; pandas
        # versions that return Interval objects from pd.cut would break this.
        df_bins['sort_key'] = df_bins['bin'].apply(lambda x: int(x[1:-1].split(',')[0]))

        # Percentage column is only added when there is at least one dataset,
        # so consumers cannot rely on 'percentage_of_datasets' being present
        if total_dataset_count > 0:
            df_bins['percentage_of_datasets'] = df_bins['count'].apply(lambda x: "{0:.4f}%".format(100 * x/float(total_dataset_count)))

        # Bin start in three renderings: raw, comma-separated, abbreviated
        df_bins['bin_start_inclusive'] = df_bins['sort_key']
        df_bins['bin_start_inclusive_commas'] = df_bins['bin_start_inclusive'].apply(lambda x: comma_sep_number(x))
        df_bins['bin_start_inclusive_abbrev'] = df_bins['bin_start_inclusive'].apply(lambda x: sizeof_fmt(x))

        # Bin end in the same three renderings
        df_bins['bin_end'] = df_bins['bin'].apply(lambda x: int(x[1:-1].split(',')[1]))
        df_bins['bin_end_commas'] = df_bins['bin_end'].apply(lambda x: comma_sep_number(x))
        df_bins['bin_end_abbrev'] = df_bins['bin_end'].apply(lambda x: sizeof_fmt(x))

        # Label: "<abbreviated start> to <abbreviated end>"
        df_bins['bin_str'] = df_bins['bin_start_inclusive_abbrev'].str.cat(df_bins['bin_end_abbrev'].values.astype(str), sep=' to ')

        # Sort the bins
        # NOTE(review): DataFrame.sort() was removed from later pandas
        # releases in favour of sort_values() -- confirm the pinned version.
        df_bins = df_bins.sort('sort_key')


        # If appropriate, skip empty bins, e.g. remove 0 counts
        if self.skip_empty_bins:
            df_bins = df_bins.query('count != 0')

        # Return as python dict
        #   # bit expensive but want orderedDict
        formatted_records_json = df_bins.to_json(orient='records')
        formatted_records = json.loads(formatted_records_json, object_pairs_hook=OrderedDict)

        data_dict = OrderedDict()
        data_dict['record_count'] = len(formatted_records)
        data_dict['dataset_count'] = total_dataset_count
        data_dict['total_bytes_used'] = total_bytes_used
        data_dict['total_bytes_used_comma'] = comma_sep_number(int(total_bytes_used))
        data_dict['total_bytes_used_abbrev'] = sizeof_fmt(total_bytes_used)
        data_dict['records'] = formatted_records

        return StatsResult.build_success_result(data_dict)
    # Return Dataverse counts grouped by 'dataversetype', each with its share
    # of the overall total formatted as a percentage string.
    #
    # Result payload: 'record_count' and 'records'; each record is built with
    # 'dataversetype', 'dataversetype_label' (underscores replaced by
    # spaces), 'type_count', 'total_count' and 'percent_string'.
    #
    # NOTE(review): the signature is cut off after `self,`. The body uses
    # `exclude_uncategorized` and `extra_filters`, and the docstring remnant
    # mentions 'uncategorized_replacement_name', so those parameters
    # presumably follow. The docstring quotes, two `else:` lines, the end of
    # the query chain and the append to `formatted_records` also appear to be
    # missing.
    def get_dataverse_counts_by_type(self,
        Return dataverse counts by 'dataversetype'

        Optional if a dataverse is uncategorized:
            - Specifying 'uncategorized_replacement_name' will
                set "UNCATEGORIZED" to another string

        Returns: { "dv_counts_by_type": [
                            "dataversetype": "RESEARCH_PROJECTS",
                            "type_count": 85,
                            "total_count": 356,
                            "percent_string": "23.9%"
                            "dataversetype": "TEACHING_COURSES",
                            "type_count": 10,
                            "total_count": 356,
                            "percent_string": "2.8%"
                            ... etc
        # Short-circuit with the stored error result if an error was flagged
        if self.was_error_found():
            return self.get_error_msg_return()

        # Retrieve the date parameters
        filter_params = self.get_date_filter_params(DVOBJECT_CREATEDATE_ATTR)

        # Add extra filters
        # (caller-supplied kwargs overwrite date filters with the same key)
        if extra_filters:
            for k, v in extra_filters.items():
                filter_params[k] = v

        # NOTE(review): as written the second assignment always overwrites
        # the first, so uncategorized dataverses would never be excluded; an
        # `else:` appears to be missing between them.
        if exclude_uncategorized:
            exclude_params = dict(dataversetype=DATAVERSE_TYPE_UNCATEGORIZED)
            exclude_params = {}

        # NOTE(review): the remainder of this query chain is not visible. The
        # code below expects dict-like rows exposing 'dataversetype' and
        # 'type_count' -- confirm against the original query.
        dataverse_counts_by_type = Dataverse.objects.select_related('dvobject'\

        # -----------------------------------
        # Get SQL query string
        # -----------------------------------
        sql_query = str(dataverse_counts_by_type.query)

        # Count all dataverses
        # Adding 0.0 makes the total a float, presumably so the percentage
        # division below is not Python 2 integer division.
        total_count = sum(
            [rec.get('type_count', 0) for rec in dataverse_counts_by_type])
        total_count = total_count + 0.0

        # Format the records, adding 'total_count' and 'percent_string' to each one
        formatted_records = []
        for rec in dataverse_counts_by_type:
            fmt_dict = OrderedDict()
            fmt_dict['dataversetype'] = rec['dataversetype']
            # Human-readable label: underscores become spaces
            fmt_dict['dataversetype_label'] = rec['dataversetype'].replace(
                '_', ' ')
            fmt_dict['type_count'] = rec.get('type_count', 0)

            # NOTE(review): as written the zero values below overwrite the
            # computed ones inside the same branch; an `else:` appears to be
            # missing before the last two assignments.
            if total_count > 0:
                float_percent = rec.get('type_count', 0) / total_count
                fmt_dict['total_count'] = int(total_count)
                fmt_dict['percent_string'] = '{0:.1%}'.format(float_percent)
                fmt_dict['total_count'] = 0
                fmt_dict['percent_string'] = '0%'

            # NOTE(review): nothing visible appends fmt_dict to
            # formatted_records.


        data_dict = OrderedDict()
        data_dict['record_count'] = len(formatted_records)
        data_dict['records'] = formatted_records

        return StatsResult.build_success_result(data_dict, sql_query)