Exemplo n.º 1
0
    def select(self, *exprs):
        """Project the DataFrame onto the given expressions.

        If any expression is an aggregation, the whole select is delegated
        to an ungrouped aggregation. Otherwise each partition is mapped
        independently; at most one generator (multi-row) expression is
        allowed per select clause.

        :param exprs: column expressions accepted by ``parse``
        :return: a new internal DataFrame with the projected schema
        """
        cols = [parse(e) for e in exprs]

        # An aggregating select is equivalent to grouping on no keys.
        if any(col.is_an_aggregation for col in cols):
            df_as_group = InternalGroupedDataFrame(self, [])
            return df_as_group.agg(exprs)

        def select_mapper(partition_index, partition):
            # Initialize non deterministic functions so that they are reproducible
            initialized_cols = [
                col.initialize(partition_index) for col in cols
            ]
            generators = [
                col for col in initialized_cols if col.may_output_multiple_rows
            ]
            non_generators = [
                col for col in initialized_cols
                if not col.may_output_multiple_rows
            ]
            number_of_generators = len(generators)
            if number_of_generators > 1:
                # str.join requires strings: convert the column objects
                # explicitly, otherwise building this error message would
                # itself raise a TypeError.
                raise Exception(
                    "Only one generator allowed per select clause but found {0}: {1}"
                    .format(number_of_generators,
                            ", ".join(str(col) for col in generators)))

            return self.get_select_output_field_lists(
                partition, non_generators, initialized_cols,
                generators[0] if generators else None)

        new_schema = get_schema_from_cols(cols, self.bound_schema)
        return self._with_rdd(self._rdd.mapPartitionsWithIndex(select_mapper),
                              schema=new_schema)
Exemplo n.º 2
0
    def describe(self, cols):
        """Build a new internal DataFrame holding summary statistics
        (as rows) for the requested columns."""
        parsed_exprs = [parse(c) for c in cols]
        helper = self.get_stat_helper(cols)
        stats_rdd = self._sc.parallelize(helper.get_as_rows())

        return DataFrameInternal(
            self._sc,
            stats_rdd,
            schema=self.get_summary_schema(parsed_exprs),
        )
Exemplo n.º 3
0
    def sortWithinPartitions(self, cols, ascending):
        """Sort the rows of each partition independently by the given
        columns; no data moves between partitions."""
        sort_key = get_keyfunc([parse(c) for c in cols], self.bound_schema)
        descending = not ascending

        sorted_rdd = self._rdd.mapPartitions(
            lambda rows: sorted(rows, key=sort_key, reverse=descending)
        )
        return self._with_rdd(sorted_rdd, self.bound_schema)
Exemplo n.º 4
0
 def summary(self, statistics):
     """Build a new internal DataFrame of summary statistics over all
     columns; defaults to the full statistic set when none are given."""
     helper = self.get_stat_helper(["*"])
     selected = statistics or ("count", "mean", "stddev", "min",
                               "25%", "50%", "75%", "max")
     return DataFrameInternal(
         self._sc,
         self._sc.parallelize(helper.get_as_rows(selected)),
         schema=self.get_summary_schema([parse("*")]),
     )
Exemplo n.º 5
0
    def filter(self, condition):
        """Keep only the rows for which *condition* evaluates truthy
        against this DataFrame's schema."""
        parsed_condition = parse(condition)

        def keep_matching(partition_index, partition):
            # Initialize per partition so non-deterministic expressions
            # stay reproducible.
            evaluator = parsed_condition.initialize(partition_index)
            for row in partition:
                if evaluator.eval(row, self.bound_schema):
                    yield row

        return self._with_rdd(
            self._rdd.mapPartitionsWithIndex(keep_matching),
            self.bound_schema)
Exemplo n.º 6
0
    def drop(self, cols):
        """Return a new internal DataFrame without the given columns.

        Unknown columns and the wildcard "*" are silently ignored.
        """
        dropped_positions = set()
        for col in cols:
            if isinstance(col, str):
                if col == "*":
                    continue
                col = parse(col)
            try:
                position = col.find_position_in_schema(self.bound_schema)
            except ValueError:
                # Column not present in the schema: nothing to drop.
                continue
            dropped_positions.add(position)

        remaining_fields = [
            field for index, field in enumerate(self.bound_schema.fields)
            if index not in dropped_positions
        ]
        new_schema = StructType(remaining_fields)

        def strip_columns(row):
            kept = [(name, row[index])
                    for index, name in enumerate(row.__fields__)
                    if index not in dropped_positions]
            return row_from_keyed_values(kept)

        return self._with_rdd(self.rdd().map(strip_columns), new_schema)
Exemplo n.º 7
0
 def withColumn(self, colName, col):
     """Select every existing column plus *col* aliased as *colName*."""
     aliased_column = parse(col).alias(colName)
     return self.select(parse("*"), aliased_column)