Example #1
    def read_csv(self, name, use_whole_file=False, names=None, skiprows=0,
                 *args, **kwargs):
        """Read a CSV file in and parse it into Pandas DataFrames.
        If no names are provided we use the first row for the names.
        header=0 is the default unless names is provided in which case
        header=None is the default.
        skiprows indicates how many rows of input to skip. This will
        only be applied to the first partition of the data (so if
        #skiprows > #rows in the first partition this will not work). Generally
        this shouldn't be an issue for small values of skiprows.
        No other values of header are supported.
        All additional parameters are passed to the read_csv function.
        """
        def csv_file(partitionNumber, files):
            file_count = 0
            for filename, contents in files:
                # Only skip lines on the first file of the first partition
                if partitionNumber == 0 and file_count == 0 and _skiprows > 0:
                    yield pandas.read_csv(StringIO(contents), *args,
                                          header=None,
                                          names=mynames,
                                          skiprows=_skiprows, **kwargs)
                else:
                    yield pandas.read_csv(StringIO(contents), *args,
                                          header=None,
                                          names=mynames,
                                          **kwargs)
                file_count += 1

        def csv_rows(partitionNumber, rows):
            inputStr = "\n".join(rows)
            if partitionNumber == 0:
                return iter([pandas.read_csv(StringIO(inputStr), *args,
                                             header=None, names=mynames,
                                             skiprows=_skiprows, **kwargs)])
            else:
                return iter([pandas.read_csv(StringIO(inputStr), *args,
                                             header=None, names=mynames,
                                             **kwargs)])

        # If we need to, peek at the first line to determine the column
        # names
        mynames = None
        _skiprows = skiprows
        if names:
            mynames = names
        else:
            # In the future we could avoid this expensive call.
            first_line = self.sc.textFile(name).first()
            frame = pandas.read_csv(StringIO(first_line))
            mynames = list(frame.columns.values)
            _skiprows += 1

        # Do the actual load
        if use_whole_file:
            return PRDD.fromRDD(
                self.sc.wholeTextFiles(name).mapPartitionsWithIndex(csv_file))
        else:
            return PRDD.fromRDD(
                self.sc.textFile(name).mapPartitionsWithIndex(csv_rows))
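
A minimal usage sketch for read_csv; the context object (here called psc) and the file path are assumptions for illustration, not part of the source.

    # Hypothetical usage; `psc` stands in for whatever context object defines
    # read_csv, and the path and column names are made up.
    people = psc.read_csv("hdfs:///data/people.csv")
    # With explicit names the header row is treated as data, so skip it here.
    people_named = psc.read_csv("hdfs:///data/people.csv",
                                names=["name", "age", "city"],
                                skiprows=1)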
Example #2
    def DataFrame(self, elements, *args, **kwargs):
        """Wraps the pandas.DataFrame operation."""
        def _load_partitions(partition):
            """Convert partitions of tuples."""
            partitionList = list(partition)
            if len(partitionList) > 0:
                (indices, elements) = zip(*partitionList)
                return iter([
                    pandas.DataFrame(data=list(elements),
                                     index=list(indices),
                                     *args,
                                     **kwargs)
                ])
            else:
                return iter([])

        # Zip with the index so we have consistent indexing as if it was
        # operated on locally
        index = range(len(elements))
        # TODO(holden): test this issue #13
        if 'index' in kwargs:
            index = kwargs['index']
        elementsWithIndex = zip(index, elements)
        return PRDD.fromRDD(
            self.sc.parallelize(elementsWithIndex).mapPartitions(
                _load_partitions))
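
A hedged usage sketch for the wrapper above; psc and the row data are illustrative assumptions.

    # Hypothetical usage: each element becomes a row, keyed by its position so
    # the distributed frame indexes like a local pandas.DataFrame would.
    rows = [{"name": "alice", "age": 3}, {"name": "bob", "age": 5}]
    distributed = psc.DataFrame(rows)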
Example #3
 def aggregate(self, f):
     """Apply the aggregation function.
     Note: This implementation does not take advantage of partial
     aggregation.
     """
     return PRDD.fromRDD(
         self._regroup_mergedRDD().values().map(lambda g: g.aggregate(f)))
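
A usage sketch, assuming grouped is an instance of this groupby wrapper; the function is handed straight to each group's pandas aggregate.

    # Hypothetical usage: any callable pandas' aggregate accepts works here,
    # e.g. the range (max minus min) of each numeric column per group.
    ranges = grouped.aggregate(lambda col: col.max() - col.min())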
Example #4
    def median(self):
        """Compute median of groups, excluding missing values.

        For multiple groupings, the result index will be a MultiIndex.
        """
        return PRDD.fromRDD(
            self._regroup_mergedRDD().values().map(lambda x: x.median()))
Example #5
 def DataFrame(self, elements, *args, **kwargs):
     """Wraps the pandas.DataFrame operation."""
     def _load_partitions(partition):
         """Convert partitions of tuples."""
         partitionList = list(partition)
         if len(partitionList) > 0:
             (indices, elements) = zip(*partitionList)
             return iter([
                 pandas.DataFrame(
                     data=list(elements),
                     index=list(indices),
                     *args,
                     **kwargs)])
         else:
             return iter([])
     # Zip with the index so we have consistent indexing as if it was
     # operated on locally
     index = range(len(elements))
     # TODO(holden): test this issue #13
     if 'index' in kwargs:
         index = kwargs['index']
     elementsWithIndex = zip(index, elements)
     return PRDD.fromRDD(
         self.sc.parallelize(elementsWithIndex).mapPartitions(
             _load_partitions))
Example #6
 def aggregate(self, f):
     """Apply the aggregation function.
     Note: This implementation does not take advantage of partial
     aggregation.
     """
     return PRDD.fromRDD(
         self._regroup_mergedRDD().values().map(
             lambda g: g.aggregate(f)))
Example #7
    def median(self):
        """Compute median of groups, excluding missing values.

        For multiple groupings, the result index will be a MultiIndex.
        """
        return PRDD.fromRDD(
            self._regroup_mergedRDD().values().map(
                lambda x: x.median()))
Example #8
 def nth(self, n, *args, **kwargs):
     """Take the nth element of each grouby."""
     # TODO: Stop collecting the entire frame for each key.
     myargs = self._myargs
     mykwargs = self._mykwargs
     nthRDD = self._regroup_mergedRDD().mapValues(
         lambda r: r.nth(n, *args, **kwargs)).values()
     return PRDD.fromRDD(nthRDD)
Example #9
    def mean(self):
        """Compute mean of groups, excluding missing values.

        For multiple groupings, the result index will be a MultiIndex.
        """
        # TODO(holden): use stats counter
        return PRDD.fromRDD(
            self._regroup_mergedRDD().values().map(lambda x: x.mean()))
Example #10
    def var(self, ddof=1):
        """Compute standard deviation of groups, excluding missing values.

        For multiple groupings, the result index will be a MultiIndex.
        """
        # TODO(holden): use stats counter
        return PRDD.fromRDD(
            self._regroup_mergedRDD().values().map(lambda x: x.var(ddof=ddof)))
Example #11
 def nth(self, n, *args, **kwargs):
     """Take the nth element of each grouby."""
     # TODO: Stop collecting the entire frame for each key.
     myargs = self._myargs
     mykwargs = self._mykwargs
     nthRDD = self._regroup_mergedRDD().mapValues(
         lambda r: r.nth(
             n, *args, **kwargs)).values()
     return PRDD.fromRDD(nthRDD)
Example #12
    def mean(self):
        """Compute mean of groups, excluding missing values.

        For multiple groupings, the result index will be a MultiIndex.
        """
        # TODO(holden): use stats counter
        return PRDD.fromRDD(
            self._regroup_mergedRDD().values().map(
                lambda x: x.mean()))
Example #13
    def var(self, ddof=1):
        """Compute standard deviation of groups, excluding missing values.

        For multiple groupings, the result index will be a MultiIndex.
        """
        # TODO(holden): use stats counter
        return PRDD.fromRDD(
            self._regroup_mergedRDD().values().map(
                lambda x: x.var(
                    ddof=ddof)))
Example #14
    def from_schema_rdd(self, schemaRDD):
        """Convert a schema RDD to a L{PRDD}."""
        def _load_kv_partitions(partition):
            """Convert a partition where each row is key/value data."""
            partitionList = list(partition)
            if len(partitionList) > 0:
                return iter([pandas.DataFrame(data=partitionList)])
            else:
                return iter([])

        return PRDD.fromRDD(schemaRDD.mapPartitions(_load_kv_partitions))
Example #15
 def from_schema_rdd(self, schemaRDD):
     """Convert a schema RDD to a L{PRDD}."""
     def _load_kv_partitions(partition):
         """Convert a partition where each row is key/value data."""
         partitionList = list(partition)
         if len(partitionList) > 0:
             return iter([
                 pandas.DataFrame(data=partitionList)
             ])
         else:
             return iter([])
     return PRDD.fromRDD(schemaRDD.mapPartitions(_load_kv_partitions))
Example #16
    def read_json(self, name,
                  *args, **kwargs):
        """Read a json file in and parse it into Pandas DataFrames.
        If no names is provided we use the first row for the names.
        Currently, it is not possible to skip the first n rows of a file.
        Headers are provided in the json file and not specified separately.
        """
        def json_file(partitionNumber, files):
            for filename, contents in files:
                yield pandas.read_json(sio(contents), *args, **kwargs)

        return PRDD.fromRDD(
            self.sc.wholeTextFiles(name).mapPartitionsWithIndex(json_file))
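
A brief usage sketch; psc and the path are assumptions. Keyword arguments are forwarded to pandas.read_json, so for newline-delimited records one might pass lines=True.

    # Hypothetical usage with a JSON-lines file; extra kwargs go straight
    # through to pandas.read_json.
    events = psc.read_json("hdfs:///data/events.json", lines=True)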
Example #17
    def csvfile(self, name, use_whole_file=True, *args, **kwargs):
        """
        Read a CSV file in and parse it into panda data frames. Note this uses
        wholeTextFiles by default underneath the hood so as to support
        multi-line CSV records so many small input files are preferred.
        All additional parameters are passed to the read_csv function
        """
        # TODO(holden): string IO stuff

        def csv_file(contents, *args, **kwargs):
            return pandas.read_csv(StringIO(contents), *args, header=0,
                                   **kwargs)

        def csv_rows(rows, *args, **kwargs):
            for row in rows:
                yield pandas.read_csv(StringIO(row), *args, header=0, **kwargs)

        if use_whole_file:
            return PRDD.fromRDD(self.sc.wholeTextFiles(name).map(
                lambda name_contents: csv_file(name_contents[1],
                                               *args, **kwargs)))
        else:
            return PRDD.fromRDD(self.sc.textFile(name).mapPartitions(
                lambda x: csv_rows(x, *args, **kwargs)))
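
A usage sketch contrasting the two modes; psc and the paths are illustrative assumptions.

    # Hypothetical usage. Whole-file mode (the default) reads each file as one
    # record, which keeps multi-line CSV rows intact but favours small files.
    frames = psc.csvfile("hdfs:///data/small_csvs/")
    # Line-oriented mode scales to large files but cannot handle newlines
    # embedded inside a quoted field.
    frames_by_line = psc.csvfile("hdfs:///data/big.csv", use_whole_file=False)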
Example #18
    def max(self):
        """Compute the max for each group."""
        myargs = self._myargs
        mykwargs = self._mykwargs

        def create_combiner(x):
            return x.groupby(*myargs, **mykwargs).max()

        def merge_value(x, y):
            return x.append(create_combiner(y)).max()

        def merge_combiner(x, y):
            return x.append(y).max(level=0)

        rddOfMax = self._sortIfNeeded(
            self._distributedRDD.combineByKey(create_combiner, merge_value,
                                              merge_combiner)).values()
        return PRDD.fromRDD(rddOfMax)
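
To make the combiner contract concrete, here is a purely local pandas sketch of one merge_combiner step (sample data invented; pandas.concat plus a level-0 groupby stands in for the append/max(level=0) calls above, which recent pandas releases no longer accept).

    import pandas
    # Partial per-group maxima produced by two different partitions.
    cx = pandas.DataFrame({"v": [5, 2]}, index=["a", "b"])
    cy = pandas.DataFrame({"v": [7, 0]}, index=["a", "b"])
    # Combine the partial results, keeping the larger value per group label.
    merged = pandas.concat([cx, cy]).groupby(level=0).max()
    # merged now holds a -> 7 and b -> 2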
Example #19
    def sum(self):
        """Compute the sum for each group."""
        myargs = self._myargs
        mykwargs = self._mykwargs

        def create_combiner(x):
            return x.groupby(*myargs, **mykwargs).sum()

        def merge_value(x, y):
            return pandas.concat([x, create_combiner(y)])

        def merge_combiner(x, y):
            return x + y

        rddOfSum = self._sortIfNeeded(
            self._distributedRDD.combineByKey(create_combiner, merge_value,
                                              merge_combiner)).values()
        return PRDD.fromRDD(rddOfSum)
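
For sum, merge_combiner relies on pandas index alignment when it adds the two partial results; a local sketch with invented data shows the behaviour, including the caveat that a group present on only one side aligns to NaN.

    import pandas
    cx = pandas.DataFrame({"v": [6, 2]}, index=["a", "b"])
    cy = pandas.DataFrame({"v": [7, 0]}, index=["a", "c"])
    # Matching group labels are added elementwise; "b" and "c" each appear on
    # only one side, so their sums come back as NaN.
    print(cx + cy)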
Example #20
    def last(self):
        """Pull out the last from each group."""
        myargs = self._myargs
        mykwargs = self._mykwargs

        def create_combiner(x):
            return x.groupby(*myargs, **mykwargs).last()

        def merge_value(x, y):
            return create_combiner(y)

        def merge_combiner(x, y):
            return y

        rddOfLast = self._sortIfNeeded(
            self._distributedRDD.combineByKey(create_combiner, merge_value,
                                              merge_combiner)).values()
        return PRDD.fromRDD(rddOfLast)
Example #21
    def from_data_frame(self, df):
        """Make a distributed dataframe from a local dataframe. The intend use
        is for testing. Note: dtypes are re-infered, so they may not match."""
        mydtype = df.dtypes
        mycols = df.columns

        def loadFromKeyRow(partition):
            pll = list(partition)
            if len(pll) > 0:
                index, data = zip(*pll)
                return iter([
                    pandas.DataFrame(list(data),
                                     columns=mycols,
                                     index=index)])
            else:
                return iter([])
        indexedData = zip(df.index, df.itertuples(index=False))
        rdd = self.sc.parallelize(indexedData).mapPartitions(loadFromKeyRow)
        return PRDD.fromRDD(rdd)
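
A usage sketch round-tripping a small local frame; psc is an assumption standing in for the object this method is defined on.

    import pandas
    # Hypothetical usage: distribute a small local frame, mainly for testing.
    local = pandas.DataFrame({"name": ["alice", "bob"], "age": [3, 5]})
    distributed = psc.from_data_frame(local)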
Example #22
    def from_data_frame(self, df):
        """Make a distributed dataframe from a local dataframe. The intend use
        is for testing. Note: dtypes are re-infered, so they may not match."""
        mydtype = df.dtypes
        mycols = df.columns

        def loadFromKeyRow(partition):
            pll = list(partition)
            if len(pll) > 0:
                index, data = zip(*pll)
                return iter([
                    pandas.DataFrame(list(data), columns=mycols, index=index)
                ])
            else:
                return iter([])

        indexedData = zip(df.index, df.itertuples(index=False))
        rdd = self.sc.parallelize(indexedData).mapPartitions(loadFromKeyRow)
        return PRDD.fromRDD(rdd)
Example #23
    def sum(self):
        """Compute the sum for each group."""
        myargs = self._myargs
        mykwargs = self._mykwargs

        def create_combiner(x):
            return x.groupby(*myargs, **mykwargs).sum()

        def merge_value(x, y):
            return pandas.concat([x, create_combiner(y)])

        def merge_combiner(x, y):
            return x + y

        rddOfSum = self._sortIfNeeded(self._distributedRDD.combineByKey(
            create_combiner,
            merge_value,
            merge_combiner)).values()
        return PRDD.fromRDD(rddOfSum)
Example #24
    def last(self):
        """Pull out the last from each group."""
        myargs = self._myargs
        mykwargs = self._mykwargs

        def create_combiner(x):
            return x.groupby(*myargs, **mykwargs).last()

        def merge_value(x, y):
            return create_combiner(y)

        def merge_combiner(x, y):
            return y

        rddOfLast = self._sortIfNeeded(self._distributedRDD.combineByKey(
            create_combiner,
            merge_value,
            merge_combiner)).values()
        return PRDD.fromRDD(rddOfLast)
Example #25
    def max(self):
        """Compute the max for each group."""
        myargs = self._myargs
        mykwargs = self._mykwargs

        def create_combiner(x):
            return x.groupby(*myargs, **mykwargs).max()

        def merge_value(x, y):
            return x.append(create_combiner(y)).max()

        def merge_combiner(x, y):
            return x.append(y).max(level=0)

        rddOfMax = self._sortIfNeeded(self._distributedRDD.combineByKey(
            create_combiner,
            merge_value,
            merge_combiner)).values()
        return PRDD.fromRDD(rddOfMax)
Example #26
    def first(self):
        """
        Pull out the first from each group. Note: this is different than
        Spark's first.
        """
        myargs = self._myargs
        mykwargs = self._mykwargs

        def create_combiner(x):
            return x.groupby(*myargs, **mykwargs).first()

        def merge_value(x, y):
            return create_combiner(x)

        def merge_combiner(x, y):
            return x

        rddOfFirst = self._sortIfNeeded(
            self._distributedRDD.combineByKey(create_combiner, merge_value,
                                              merge_combiner)).values()
        return PRDD.fromRDD(rddOfFirst)
Example #27
    def first(self):
        """
        Pull out the first from each group. Note: this is different than
        Spark's first.
        """
        myargs = self._myargs
        mykwargs = self._mykwargs

        def create_combiner(x):
            return x.groupby(*myargs, **mykwargs).first()

        def merge_value(x, y):
            return create_combiner(x)

        def merge_combiner(x, y):
            return x

        rddOfFirst = self._sortIfNeeded(self._distributedRDD.combineByKey(
            create_combiner,
            merge_value,
            merge_combiner)).values()
        return PRDD.fromRDD(rddOfFirst)
Example #28
    def apply(self, func, *args, **kwargs):
        """Apply the provided function and combine the results together in the
        same way as apply from groupby in pandas.

        This returns a PRDD.
        """
        def key_by_index(data):
            """Key each row by its index.
            """
            # TODO: Is there a better way to do this?
            for key, row in data.iterrows():
                yield (key, pandas.DataFrame.from_dict(dict([(key, row)]),
                                                       orient='index'))

        myargs = self._myargs
        mykwargs = self._mykwargs
        regroupedRDD = self._distributedRDD.mapValues(
            lambda data: data.groupby(*myargs, **mykwargs))
        appliedRDD = regroupedRDD.map(
            lambda key_data: key_data[1].apply(func, *args, **kwargs))
        reKeyedRDD = appliedRDD.flatMap(key_by_index)
        prdd = self._sortIfNeeded(reKeyedRDD).values()
        return PRDD.fromRDD(prdd)
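
A usage sketch, assuming grouped is an instance of this groupby wrapper; as with pandas groupby.apply, the function receives each group's sub-frame.

    # Hypothetical usage: summarise each group's numeric columns.
    summaries = grouped.apply(lambda frame: frame.sum(numeric_only=True))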
Example #29
    def apply(self, func, *args, **kwargs):
        """Apply the provided function and combine the results together in the
        same way as apply from groupby in pandas.

        This returns a PRDD.
        """
        def key_by_index(data):
            """Key each row by its index.
            """
            # TODO: Is there a better way to do this?
            for key, row in data.iterrows():
                yield (key,
                       pandas.DataFrame.from_dict(dict([(key, row)]),
                                                  orient='index'))

        myargs = self._myargs
        mykwargs = self._mykwargs
        regroupedRDD = self._distributedRDD.mapValues(
            lambda data: data.groupby(*myargs, **mykwargs))
        appliedRDD = regroupedRDD.map(
            lambda key_data: key_data[1].apply(func, *args, **kwargs))
        reKeyedRDD = appliedRDD.flatMap(key_by_index)
        prdd = self._sortIfNeeded(reKeyedRDD).values()
        return PRDD.fromRDD(prdd)
Example #30
 def DataFrame(self, elements, *args, **kwargs):
     """
     Wraps the pandas.DataFrame operation.
     """
     return PRDD.fromRDD(self.sc.parallelize(elements).map(
         lambda element: pandas.DataFrame(data=[element], *args, **kwargs)))
Example #31
 def sql(self, query):
     """Perform a SQL query and create a L{PRDD} of the result."""
     return PRDD.fromRDD(
         self.from_schema_rdd(
             self._get_sqlctx().sql(query)))
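
A usage sketch; the table name is hypothetical and assumed to be registered with the underlying SQL context already.

    # Hypothetical usage: run a query and get the result back as a PRDD.
    adults = psc.sql("SELECT name, age FROM people WHERE age > 21")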
Example #32
 def sql(self, query):
     """Perform a SQL query and create a L{PRDD} of the result."""
     return PRDD.fromRDD(self.from_schema_rdd(
         self._get_sqlctx().sql(query)))
Example #33
    def read_csv(self,
                 name,
                 use_whole_file=False,
                 names=None,
                 skiprows=0,
                 *args,
                 **kwargs):
        """Read a CSV file in and parse it into Pandas DataFrames.
        If no names are provided we use the first row for the names.
        header=0 is the default unless names is provided in which case
        header=None is the default.
        skiprows indicates how many rows of input to skip. This will
        only be applied to the first partition of the data (so if
        #skiprows > #rows in the first partition this will not work). Generally
        this shouldn't be an issue for small values of skiprows.
        No other values of header are supported.
        All additional parameters are passed to the read_csv function.
        """
        def csv_file(partitionNumber, files):
            file_count = 0
            for filename, contents in files:
                # Only skip lines on the first file of the first partition
                if partitionNumber == 0 and file_count == 0 and _skiprows > 0:
                    yield pandas.read_csv(sio(contents),
                                          *args,
                                          header=None,
                                          names=mynames,
                                          skiprows=_skiprows,
                                          **kwargs)
                else:
                    yield pandas.read_csv(sio(contents),
                                          *args,
                                          header=None,
                                          names=mynames,
                                          **kwargs)
                file_count += 1

        def csv_rows(partitionNumber, rows):
            inputStr = "\n".join(rows)
            if partitionNumber == 0:
                return iter([
                    pandas.read_csv(sio(inputStr),
                                    *args,
                                    header=None,
                                    names=mynames,
                                    skiprows=_skiprows,
                                    **kwargs)
                ])
            else:
                # could use .iterrows instead?
                return iter([
                    pandas.read_csv(sio(inputStr),
                                    *args,
                                    header=None,
                                    names=mynames,
                                    **kwargs)
                ])

        # If we need to, peek at the first line to determine the column
        # names
        mynames = None
        _skiprows = skiprows
        if names:
            mynames = names
        else:
            # In the future we could avoid this expensive call.
            first_line = self.sc.textFile(name).first()
            frame = pandas.read_csv(sio(first_line), **kwargs)
            mynames = list(frame.columns.values)
            _skiprows += 1

        # Do the actual load
        if use_whole_file:
            return PRDD.fromRDD(
                self.sc.wholeTextFiles(name).mapPartitionsWithIndex(csv_file))
        else:
            return PRDD.fromRDD(
                self.sc.textFile(name).mapPartitionsWithIndex(csv_rows))