예제 #1
0
 def aggregate(self, aggregations={}, parentKey=AttributeDict()):
     '''Collapse this leaf into a single aggregated row.

     Parameters:
         aggregations - a dict of field name -> aggregate method; every method
             is called as method(parentKey, self) and its result becomes the
             value of that field
         parentKey - handed unchanged to every aggregate method as its first
             argument
     Returns an empty HierarchyLeaf when there are no aggregations, otherwise
     a HierarchyLeaf holding exactly one row with one entry per aggregation.
     '''
     # NOTE(review): both defaults are built once at definition time; this
     # method only reads them.
     if aggregations:
         summaryRow = AttributeDict(
             (name, method(parentKey, self))
             for name, method in aggregations.items())
         return HierarchyLeaf([summaryRow])
     return HierarchyLeaf()
예제 #2
0
    def join(self,
             other,
             joinParams=None,
             otherFieldPrefix='',
             joinType=JoinType.LEFT_OUTER_JOIN):
        '''Return a new table joining rows of this table with rows of another.

        dataTable.join(otherTable, joinParams, otherFieldPrefix='')

        Parameters:
            other - the table to join
            joinParams - a dictionary of <field in self> to <field in other>.
                Defaults to a "natural join", matching on the headers the two
                tables have in common
            otherFieldPrefix - a string to prepend to the fields added from
                the second table (the second table's join fields are not
                prefixed)
            joinType - the instance of JoinType which indicates if items
                should be included from one data table when they have no
                match in the other (read through its leftOuter and rightOuter
                attributes)
        Raises Exception if joinParams is given but is not a dictionary.
        '''
        selfHeaders = list(self.headers())
        otherHeaders = list(other.headers())
        if joinParams is None:
            joinParams = {h: h for h in selfHeaders if h in otherHeaders}
        elif not isinstance(joinParams, dict):
            raise Exception(
                "joinParams must be a dictionary of <field in self> to <field in other>"
            )
        # The keys are this table's fields and the values are the other
        # table's fields; both lists share one order so that the key tuples
        # built from either side line up position by position.
        selfJoinHeaders = list(joinParams.keys())
        otherJoinHeaders = [joinParams[h] for h in selfJoinHeaders]
        otherJoinHeaderSet = set(otherJoinHeaders)

        # Headers contributed by the other table: join fields keep their
        # name, every remaining field gets otherFieldPrefix prepended.
        newOtherHeaders = {
            (v if v in otherJoinHeaderSet else otherFieldPrefix + v)
            for v in otherHeaders
        }
        otherBuckets = other.extend(
            lambda row: {
                otherFieldPrefix + v: row[v]
                for v in otherHeaders if v not in otherJoinHeaderSet
            }).project(newOtherHeaders).bucket(*otherJoinHeaders)
        # Placeholder rows used to pad the side that has no match.
        emptyOtherRow = AttributeDict({
            otherFieldPrefix + v: None
            for v in otherHeaders if v not in otherJoinHeaderSet
        })
        emptySelfRow = AttributeDict({
            header: None
            for header in selfHeaders if header not in selfJoinHeaders
        })
        otherKeysSeen = set()

        def it():
            for row in self:
                rowKey = tuple(row[selfHeader]
                               for selfHeader in selfJoinHeaders)
                otherKeysSeen.add(rowKey)
                if rowKey in otherBuckets:
                    for otherRow in otherBuckets[rowKey]:
                        yield row + otherRow
                elif joinType.leftOuter:
                    yield emptyOtherRow + row
            # Only after every row of this table has been seen do we know
            # which buckets of the other table were never matched.
            if joinType.rightOuter:
                for otherKey, otherBucket in otherBuckets.items():
                    if otherKey not in otherKeysSeen:
                        for row in otherBucket:
                            yield emptySelfRow + row

        return DataTableStream(it(),
                               set(selfHeaders).union(newOtherHeaders))
예제 #3
0
 def originalToRows(self, keyFields):
     '''Build one row per entry of self.toRow.

     Each row maps every field in self.diffFields to the value found at that
     field's index in the entry, combined with the keyFields paired up with
     self.key. Returns an empty list when self.toRow is empty or None.
     '''
     rebuilt = []
     for toRow in (self.toRow or []):
         diffValues = AttributeDict({
             name: toRow[position]
             for name, position in self.diffFields.items()
         })
         rebuilt.append(diffValues + dict(zip(keyFields, self.key)))
     return rebuilt
예제 #4
0
    def aggregate(self, groupBy, aggregations={}):
        '''Group the rows by a set of fields and aggregate each group.

        Parameters:
            groupBy - the fields to group by
            aggregations - a dict of field name -> aggregator. Each aggregator
                supplies newBucket(row), addRow(row, bucket) and
                finalize(bucket), which open, feed and close the running
                value of that field for one group.
        Returns the distinct projection onto groupBy when no aggregations are
        given; otherwise a DataTable with one row per group, ordered by the
        sorted group keys.
        '''
        if not aggregations:
            return self.project(groupBy).distinct()

        runningByKey = {}
        for row in self:
            groupKey = tuple(row[field] for field in groupBy)
            running = runningByKey.get(groupKey)
            if running is None:
                # First row of this group: open one bucket per aggregation.
                running = {
                    name: agg.newBucket(row)
                    for name, agg in aggregations.items()
                }
                runningByKey[groupKey] = running
            for name, agg in aggregations.items():
                running[name] = agg.addRow(row, running[name])

        return DataTable([
            AttributeDict(zip(groupBy, groupKey)) + {
                name: agg.finalize(running[name])
                for name, agg in aggregations.items()
            }
            for groupKey, running in sorted(runningByKey.items())
        ])
예제 #5
0
 def bucket(self, *fields):
     '''Returns a dict of bucket -> DataTable of rows matching that bucket

     The bucket key is the tuple of a row's values for the given fields, in
     the order the fields were passed.
     '''
     rowsByKey = {}
     for row in self.__data:
         bucketKey = tuple(row[field] for field in fields)
         rowsByKey.setdefault(bucketKey, []).append(row)
     return AttributeDict(
         (bucketKey, DataTable(rows)) for bucketKey, rows in rowsByKey.items())
예제 #6
0
 def renameColumn(self, column, newName):
     '''Rename a column.

     Every row key and every header equal to column is replaced by newName;
     all others are kept. The work is delegated to self.transform and its
     result is returned.
     NOTE(review): whether this happens in place depends on transform -
     confirm against its implementation.
     '''
     def renamed(header):
         if header != column:
             return header
         return newName

     def renameRow(row):
         return AttributeDict(
             (renamed(key), value) for key, value in row.items())

     return self.transform(renameRow,
                           {renamed(header)
                            for header in self.__headers})
예제 #7
0
    def __init__(self, data=None, parseMethod=None):
        '''Create a data table from the given data

        data may be one of the following:
            A sequence of dictionaries; the headers are the union of all
                keys, and a row missing a header gets None for it
            A sequence of sequences where the first item is the list of headers
            Another DataTable instance, whose rows are copied into new
                AttributeDicts (the values themselves are shared)
            A string which may be parsed into one of the previous by calling
                parseMethod on the string.
        Empty or missing data produces a table with no rows and no headers.
        Raises TypeError if data is a string and no parseMethod is given.
        '''
        if isinstance(data, DataTable):
            self.__headers = {
                h: DataColumn(self, c)
                for h, c in data.__headers.items()
            }
            self.__data = [
                AttributeDict((h.header, row[h.header])
                              for h in self.__headers.values()) for row in data
            ]
            return
        if isinstance(data, str):
            if parseMethod is None:
                raise TypeError(
                    'parseMethod is required when data is a string')
            data = parseMethod(data)
        # Materialise once so that generators can be inspected and indexed;
        # None and other falsy inputs count as "no data".
        data = list(data) if data else []
        if not data:
            self.__data = []
            self.__headers = {}
            return
        if isinstance(data[0], dict):
            headers = {k for row in data for k in row.keys()}
            self.__headers = {h: DataColumn(self, h) for h in headers}
            self.__data = []
            for row in data:
                # Fill the gaps on our own copy so that the caller's
                # dictionaries are left untouched.
                filled = AttributeDict(row)
                for header in headers:
                    if header not in filled:
                        filled[header] = None
                self.__data.append(filled)
        else:
            # The first sequence names the columns; the rest are the rows.
            headers = data.pop(0)
            self.__headers = {h: DataColumn(self, h) for h in headers}
            self.__data = [AttributeDict(zip(headers, row)) for row in data]
예제 #8
0
 def addValues(self, values):
     '''Adds the values from the values dict to this hierarchy

     The value of the first key header selects the child entry. With more
     than one key header left, the values are passed down to a sub-hierarchy
     built from the remaining key headers; otherwise a row made of the leaf
     headers is appended to the leaf stored under that value. Missing
     children are created on demand.
     '''
     headKey = values[self.keyHeaders[0]]
     if len(self.keyHeaders) != 1:
         # More key levels remain: descend, creating the child if needed.
         if headKey not in self._data:
             self._data[headKey] = Hierarchy(self.keyHeaders[1:],
                                             self.leafHeaders)
         self[headKey].addValues(values)
         return
     # Last key level: the child is a leaf that collects rows.
     if headKey not in self._data:
         self._data[headKey] = HierarchyLeaf()
     leafRow = AttributeDict((k, values[k]) for k in self.leafHeaders)
     self._data[headKey].append(leafRow)
예제 #9
0
    def checkRemove_multiField(self, filterMethod, *fields):
        '''Remove a set of fields from the result if filterMethod approves.

        filterMethod is a method which takes two dicts, fromRow and toRow,
        holding the values of the given fields, and returns whether those
        values can be removed from the result.
        fields is the list of fields to check and possibly remove.
        Nothing is removed unless every one of the fields is currently
        present in the result.
        '''
        indexed = tuple((name, self.diffFields[name]) for name in fields)
        for _, position in indexed:
            if position not in self.__data:
                return
        # Each stored entry is indexed with 0 for the "from" value and 1 for
        # the "to" value.
        fromRow = AttributeDict(
            (name, self.__data[position][0]) for name, position in indexed)
        toRow = AttributeDict(
            (name, self.__data[position][1]) for name, position in indexed)
        if not filterMethod(fromRow, toRow):
            return
        for _, position in indexed:
            del self.__data[position]
예제 #10
0
    def aggregate(self, aggregations={}, parentKey=AttributeDict()):
        '''Return an aggregation of the hierarchy's children.

        The resulting Hierarchy has the same key headers, with every child
        replaced by the result of its own aggregate call.
        Parameters:
            aggregations - a dict of field name -> aggregate method; passed
                on to every child
            parentKey - the key values collected so far; each child receives
                it extended with this level's key header and the child's key
        Without aggregations the result of self.reindex(self.keyHeaders, ())
        is returned instead.
        '''
        if aggregations:
            result = Hierarchy(self.keyHeaders, aggregations.keys())
            for key, child in self:
                childKey = parentKey + {self.keyHeaders[0]: key}
                result._data[key] = child.aggregate(aggregations,
                                                    parentKey=childKey)
            return result
        return self.reindex(self.keyHeaders, ())
예제 #11
0
 def tempIterRows():
     '''Yield one aggregated row per group, in sorted group-key order.

     Reads self, groupBy and aggregations from the enclosing scope. All rows
     are consumed and accumulated before the first row is yielded.
     '''
     runningByKey = {}
     for row in self:
         groupKey = tuple(row[field] for field in groupBy)
         running = runningByKey.get(groupKey)
         if running is None:
             # First row of this group: open one bucket per aggregation.
             running = runningByKey[groupKey] = {
                 name: agg.newBucket(row)
                 for name, agg in aggregations.items()
             }
         for name, agg in aggregations.items():
             running[name] = agg.addRow(row, running[name])
     for groupKey, running in sorted(runningByKey.items()):
         keyPart = AttributeDict(zip(groupBy, groupKey))
         yield keyPart + {
             name: agg.finalize(running[name])
             for name, agg in aggregations.items()
         }
예제 #12
0
 def tempIterRows():
     '''Yield one row per column of self.

     Reads self and rowIDs from the enclosing scope. The column's values are
     keyed by rowIDs, and the column's header is stored under 'Field'.
     '''
     for col in self.columns():
         transposed = AttributeDict(zip(rowIDs, col))
         transposed['Field'] = col.header
         yield transposed
예제 #13
0
 def project(self, newLeafHeaders):
     '''Return a new HierarchyLeaf whose rows keep only the entries whose
     key is in newLeafHeaders.'''
     def keepSelected(row):
         return AttributeDict(
             (header, value) for header, value in row.items()
             if header in newLeafHeaders)

     return HierarchyLeaf(keepSelected(row) for row in self)
예제 #14
0
 def copy(self):
     '''Return a new HierarchyLeaf built from an AttributeDict of each row.'''
     # NOTE(review): presumably a shallow copy of every row - the values
     # themselves are not duplicated here; confirm against AttributeDict.
     rowCopies = (AttributeDict(row) for row in self)
     return HierarchyLeaf(rowCopies)
예제 #15
0
def parseCsv(f, headers=None, sep=',', quot='"'):
    '''Parse delimited text into a DataTable with one row per record.

    Parameters:
        f - the source of lines, handed to csv.DictReader
        headers - the field names, passed as csv.DictReader's fieldnames
            (with None, csv.DictReader takes them from the first row)
        sep - the field delimiter
        quot - the quote character
    '''
    reader = csv.DictReader(
        f, fieldnames=headers, delimiter=sep, quotechar=quot)
    return DataTable(AttributeDict(record) for record in reader)
예제 #16
0
 def asDict(self):
     '''Return an AttributeDict built from iterating over this object.'''
     # NOTE(review): presumably iteration yields (key, value) pairs - confirm
     # against this class's __iter__.
     entries = iter(self)
     return AttributeDict(entries)
예제 #17
0
 def parse():
     '''Yield one AttributeDict per line of f, cut at fixed positions.

     Reads f and headers from the enclosing scope. Every header is indexed
     as [0] = field name, [1] = start and [2] = end of the slice taken from
     the line.
     '''
     for line in f:
         record = AttributeDict()
         for spec in headers:
             bounds = slice(spec[1], spec[2])
             record[spec[0]] = line[bounds]
         yield record