示例#1
0
    def __call__(self, dataframe, path ):
        '''iterate over leaves/branches in data structure.

        This method will call the :meth:`render` method for
        each leaf/branch at level :attr:`nlevels`.

        If :attr:`split_at` is set and the first index level holds at
        least that many distinct labels, the data frame is rendered in
        chunks of :attr:`split_at` first-level labels. Labels matching
        one of the regular expressions in :attr:`split_always` are added
        to every chunk.

        :param dataframe: data frame to render.
        :param path: tuple describing the position of ``dataframe``;
            the chunk number is appended to it when splitting.
        :returns: a ResultBlocks collection with the rendered output.
        :raises NotImplementedError: if :attr:`nlevels` is None.
        :raises ValueError: if the number of index levels differs from
            :attr:`nlevels` (no check is done if :attr:`nlevels` is -1).
        '''
        if self.nlevels is None:
            raise NotImplementedError("incomplete implementation of %s" % str(self))

        try:
            labels = dataframe.index.levels
        except AttributeError:
            # not a hierarchical index: count it as a single level
            labels = ['dummy1']

        result = ResultBlocks()

        if self.nlevels != -1 and len(labels) != self.nlevels:
            raise ValueError( "at path %s: expected %i levels - got %i: %s" %\
                                  (str(path), self.nlevels,
                                   len(labels), str(labels)) )

        if not self.split_at:
            # print without splitting
            result.extend( self.render( dataframe, path ) )
            return result

        # split dataframe at first index
        first_level_labels = dataframe.index.get_level_values(0).unique()
        if len(first_level_labels) < self.split_at:
            # too few labels to be worth splitting
            result.extend( self.render( dataframe, path ) )
            return result

        # select tracks that are added to every chunk
        if self.split_always:
            always = [ x for x, y in itertools.product( first_level_labels, self.split_always) \
                           if re.search( y, x ) ]
        else:
            always = []

        # does not change between chunks
        single_level = len(dataframe.index.names) == 1

        for z, x in enumerate(range( 0, len(first_level_labels), self.split_at)):
            select = list(DataTree.unique( always + list(first_level_labels[x:x+self.split_at]) ))

            if single_level:
                # if only one level, use loc to obtain dataframe
                # index is duplicated, so ignore second level
                work = pandas.concat( [dataframe.loc[[s]] for s in select], keys = select )
                # pass a real list: a range object is not a list under
                # python 3 and would be treated as a single level key.
                work.reset_index( list(range( 1, len(work.index.names))),
                                  drop=True, inplace=True )
            else:
                work = pandas.concat( [dataframe.xs(s, axis=0) for s in select], keys = select )

            # reconcile index names
            work.index.names = dataframe.index.names
            result.extend( self.render( work, path + (z, ) ) )

        return result
示例#2
0
    def render( self ):
        '''supply the :class:`Renderer.Renderer` with the data to render.

        The data supplied will depend on the ``groupby`` option.

        The data in :attr:`data` is converted into a data frame. The
        frame is either passed to the renderer as a whole or split
        into cross-sections along the leading ``group_level`` index
        levels, with one renderer call per cross-section. An exception
        raised by the renderer for a cross-section is logged and
        replaced by an exception block; the remaining cross-sections
        are still rendered.

        :returns: a ResultBlocks data structure.
        :raises ValueError: if there is no data after conversion, if the
            data frame could not be sorted deeply enough, or if
            no result blocks were produced.
        '''
        self.debug( "%s: rendering data started for %i items" % (self,
                                                                 len(self.data)))

        # get number of levels required by renderer.
        # -1 avoids any grouping; important for user renderers
        # that are functions and have no level attribute.
        renderer_nlevels = getattr( self.renderer, 'nlevels', -1 )

        # initiate output structure
        results = ResultBlocks( title = "")

        # convert to data series
        # The data is melted, i.e,
        # BMW    price    10000
        # BMW    speed    100
        # Golf   price    5000
        # Golf   speed    50
        dataframe = DataTree.asDataFrame( self.data )

        if dataframe is None:
            self.warn( "%s: no data after conversion" % self )
            raise ValueError( "no data for renderer" )

        # special patch: set column names to pruned levels
        # if there are no column names
        if len(dataframe.columns) == len(self.pruned):
            if list(dataframe.columns) == list(range( len(dataframe.columns))):
                dataframe.columns = [x[1] for x in self.pruned]

        def getIndexLevels( index ):
            '''return the number of levels in index (1 for a flat index).'''
            try:
                # hierarchical index
                return len(index.levels)
            except AttributeError:
                return 1

        nlevels = getIndexLevels( dataframe.index )

        self.debug( "%s: rendering data started. levels=%i, required levels>=%i, group_level=%s" %\
                        (self, nlevels,
                         renderer_nlevels,
                         str(self.group_level) ) )

        if renderer_nlevels < 0 and self.group_level <= 0:
            # no grouping for renderers that will accept
            # a dataframe with any level of indices and no explicit
            # grouping has been asked for.
            results.append( self.renderer( dataframe, path = () ) )
        else:
            # user specified group level by default
            group_level = self.group_level

            # set group level to maximum allowed by renderer
            if renderer_nlevels >= 0:
                group_level = max(nlevels - renderer_nlevels, group_level)

            # add additional level if necessary
            if nlevels < group_level:
                prefix = tuple(["level%i" % x for x in range( group_level - nlevels)])
                # NOTE(review): prefix + x requires every index entry to be
                # a tuple, i.e. a hierarchical index - confirm for flat indices.
                dataframe.index = pandas.MultiIndex.from_tuples( [ prefix + x for x in dataframe.index ] )

            # used to be: group_level + 1
            # hierarchical index
            # numpy.unique converts everything to a string
            # which is not consistent with selecting later
            #
            # build a real list: the paths are indexed below and
            # map() returns a one-shot iterator under python 3.
            paths = [ tuple(x) for x in
                      DataTree.unique( [ x[:group_level] for x in dataframe.index.unique() ] ) ]

            # an empty data frame yields no paths; fall through to
            # the "no data" error below instead of an IndexError.
            pathlength = len(paths[0]) - 1 if paths else 0

            is_hierarchical = isinstance( dataframe.index, pandas.MultiIndex )

            if is_hierarchical:
                # Note: can only sort hierarchical indices
                dataframe = dataframe.sortlevel()

                if dataframe.index.lexsort_depth < pathlength:
                    raise ValueError('could not sort data frame: sort depth=%i < pathlength=%i, dataframe=%s' \
                                         % (dataframe.index.lexsort_depth,
                                            pathlength,
                                            dataframe))

            for path in paths:

                if path:
                    if len(path) == nlevels:
                        # extract with loc in order to obtain dataframe
                        work = dataframe.loc[[path]]
                    else:
                        # select data frame as cross-section
                        work = dataframe.xs(path, axis=0 )
                else:
                    # empty tuple - use full data set
                    work = dataframe

                # remove columns and rows in work that are all Na
                work = work.dropna( axis=1, how='all').dropna( axis=0, how='all')

                if is_hierarchical and renderer_nlevels >= 0:
                    work_levels = getIndexLevels( work.index )
                    # reduce levels of indices required to that required
                    # for Renderer. This occurs if groupby=none.
                    if work_levels > renderer_nlevels:
                        # collapse the leading levels into a single label
                        sep = work_levels - (renderer_nlevels - 1)
                        tuples = [ ( DataTree.path2str( x[:sep] ), ) + x[sep:] \
                                       for x in work.index ]
                        work.index = pandas.MultiIndex.from_tuples( tuples )

                try:
                    results.append( self.renderer( work,
                                                   path = path ))
                except Exception:
                    # best effort: report the failure as a block, but let
                    # KeyboardInterrupt/SystemExit propagate.
                    self.error( "%s: exception in rendering" % self )
                    results.append( ResultBlocks( Utils.buildException( "rendering" ) ) )

        if len(results) == 0:
            self.warn("renderer returned no data.")
            raise ValueError( "renderer returned no data." )

        self.debug( "%s: rendering data finished with %i blocks" % (self.tracker, len(results)))

        return results
示例#3
0
    def asSpreadSheet( self, dataframe, row_headers, col_headers, title ):
        '''save the table as an xls file.

        Multiple files of the same Renderer/Tracker combination are distinguished
        by the title.

        Data frames with more than 10000 rows are written row-by-row
        into a write-optimized workbook, smaller ones cell-by-cell
        with rst hyperlinks converted into cell hyperlinks. A data
        frame with a hierarchical index of more than one level is
        split into one worksheet per combination of the leading
        levels, preceded by a summary worksheet.

        :param dataframe: table to save.
        :param row_headers: row labels; used for the first column in
            the row-by-row writer and for the reported table size.
        :param col_headers: column labels; used for the header row in
            the row-by-row writer and for the reported table size.
        :param title: title of the result block and of the worksheet
            if the table is not split.
        :returns: a ResultBlock with the workbook attached as ``xls``.
        '''

        self.debug("%s: saving %i x %i table as spread-sheet'"% (id(self),
                                                                 len(row_headers),
                                                                 len(col_headers)))
        quick = len(dataframe) > 10000

        if quick:
            # quick writing, only append method works
            wb = openpyxl.Workbook( optimized_write = True)
            def addWorksheet( wb, dataframe, title ):
                ws = wb.create_sheet()

                ws.append( [""] + list(col_headers) )
                # iterrows() yields (index, Series) pairs: only the
                # values of the series are written.
                # NOTE(review): row_headers is indexed by position within
                # the frame passed in - confirm this is right for split tables.
                for x, (_, row) in enumerate( dataframe.iterrows() ):
                    ws.append( [path2str(row_headers[x])] + list(row) )

                # patch: maximum title length seems to be 31
                ws.title = title[:30]

        else:
            # do it cell-by-cell, this might be slow
            wb = openpyxl.Workbook( optimized_write = False)

            # regex to detect rst hyperlinks
            regex_link = re.compile( '`(.*) <(.*)>`_')

            def addWorksheet( wb, dataframe, title ):
                ws = wb.create_sheet()

                for column, column_name in enumerate( dataframe.columns ):
                    c = ws.cell( row=0, column=column)
                    c.value = column_name
                    dataseries = dataframe[column_name]
                    if dataseries.dtype == object:
                        for row, value in enumerate( dataseries ):
                            c = ws.cell( row=row+1, column=column)
                            value = str(value)
                            link = None
                            if value.startswith('`'):
                                link = regex_link.match( value )
                            if link:
                                c.value, c.hyperlink = link.groups()
                            else:
                                # plain text, including text that starts
                                # with a backtick but is not a link
                                c.value = value
                    else:
                        for row, value in enumerate( dataseries ):
                            c = ws.cell( row=row+1, column=column)
                            c.value = value
                # patch: maximum title length seems to be 31
                ws.title = title[:30]

        is_hierarchical = isinstance( dataframe.index, pandas.MultiIndex )

        split = is_hierarchical and len(dataframe.index.levels) > 1

        if split:
            # create separate worksheets for nested indices
            nlevels = len(dataframe.index.levels)
            paths = [ tuple(x) for x in
                      DataTree.unique( [ x[:nlevels-1] for x in dataframe.index.unique() ] ) ]

            ws = wb.worksheets[0]
            ws.title = 'Summary'
            # header: one column per leading index level, matching
            # the path columns of the rows appended below
            ws.append( list(dataframe.index.names[:nlevels-1]) + ["Worksheet", "Rows" ] )

            for row, path in enumerate(paths):
                # select data frame as cross-section
                work = dataframe.xs(path, axis=0 )
                # separate name: the title argument is still needed
                # for the result block below
                sheet_title = path2str( path )[:30]
                ws.append( list(path) + [sheet_title, len(work)] )
                c = ws.cell( row = row+1, column = nlevels )
                c.hyperlink = "#%s" % sheet_title
                addWorksheet( wb, work, title = sheet_title )

        else:
            addWorksheet( wb, dataframe, title = title )

        # write result block
        lines = []
        lines.append("`%i x %i table <#$xls %s$#>`__" %\
                     (len(row_headers), len(col_headers),
                      title) )
        lines.append( "" )

        r = ResultBlock( "\n".join(lines), title = title)
        r.xls = wb

        self.debug("%s: saved %i x %i table as spread-sheet'"% (id(self),
                                                                len(row_headers),
                                                                len(col_headers)))
        return r