def __getitem__(self, obj):
    """'Indexing' functionality for the BigDataFrame

    A single object or a list is interpreted as a relational projection
    (i.e., a selection of columns).

    A tuple of length 2 is interpreted as `(rows, columns)`: the first
    element is used for row selection (i.e., predicate/filter/WHERE
    clause) and the second element is used as a projection.

    Returns a new `BigDataFrame` whose query selects from this one's query
    wrapped as an inline view.
    """
    # other select/filter fns should be implemented with this one
    is_pair = isinstance(obj, tuple) and len(obj) == 2
    if not (is_pair or isinstance(obj, list)):
        # single object, possibly a slice; wrap in list and get projection
        return self[[obj]]

    # Both remaining forms read from the current query, exposed under a
    # randomly generated alias.
    view_alias = _random_id('inline_', 4)
    source = InlineView(self._query_ast.to_sql(), view_alias)

    if is_pair:
        # (row filter, projection)
        limit_elt, where = self._query_ast._filter(obj[0])
        columns = self._query_ast._projection(obj[1])
        stmt = SelectStmt(columns, source, where=where, limit=limit_elt)
    else:
        # plain list => projection only
        columns = self._query_ast._projection(obj)
        stmt = SelectStmt(columns, source)
    return BigDataFrame(self._ic, stmt)
def schema(self):
    """Return the schema of this BDF, computing it on first access.

    The result is cached in `self._schema`.  When no cached value exists,
    the current query is wrapped as an inline view and its SQL is handed to
    `_get_table_schema_hack` together with the connection's cursor.
    """
    if self._schema is not None:
        # already computed; reuse the cached value
        return self._schema

    view = InlineView(self._query_ast.to_sql(), _random_id('inline_', 4))
    self._schema = _get_table_schema_hack(self._ic._cursor, view.to_sql())
    return self._schema
def from_sql_query(ic, query, alias=None):
    """Create a BDF from a SQL query executed by Impala

    `query` is wrapped as an inline view named `alias`; when `alias` is
    falsy a random `inline_`-prefixed identifier is generated instead.  The
    view's schema is fetched through `ic._cursor`, and every column in it
    becomes one item of the resulting select list.
    """
    view_name = alias or _random_id('inline_', 4)
    view = InlineView(query, view_name)
    columns = _get_table_schema_hack(ic._cursor, view.to_sql())
    # schema entries are (column, type) pairs; only the column is needed
    items = tuple(SelectItem(expr=Literal(name)) for (name, _) in columns)
    return BigDataFrame(ic, SelectStmt(items, view))
def schema(self):
    """Return this BDF's schema, fetching and caching it if necessary.

    On the first call (while `self._schema` is `None`) the current query is
    wrapped as a randomly-aliased inline view, and the schema is obtained
    by passing that view's SQL and the connection's cursor to
    `_get_table_schema_hack`.  Later calls return the cached value.
    """
    cached = self._schema
    if cached is not None:
        return cached

    inline = InlineView(
        self._query_ast.to_sql(), _random_id('inline_', 4))
    self._schema = _get_table_schema_hack(
        self._ic._cursor, inline.to_sql())
    return self._schema
def from_sql_query(ic, query, alias=None):
    """Create a BDF from a SQL query executed by Impala

    The query is exposed as an inline view.  Its name is `alias` when a
    truthy one is supplied, otherwise a random `inline_`-prefixed
    identifier.  The schema of that view is looked up through `ic._cursor`
    and used to build a select list with one item per column.
    """
    if alias:
        query_alias = alias
    else:
        query_alias = _random_id('inline_', 4)

    table_ref = InlineView(query, query_alias)
    schema = _get_table_schema_hack(ic._cursor, table_ref.to_sql())

    # each schema entry is a (column, type) pair; the type is not used here
    select_list = tuple(
        SelectItem(expr=Literal(column)) for (column, _type) in schema)
    return BigDataFrame(ic, SelectStmt(select_list, table_ref))
def join(self, other, on=None, how='inner', hint=None):
    """Join this BDF to another one.

    `on` is `None`, `string`, `Expr`, or `list[string]`.  `on`, `how` and
    `hint` are forwarded unchanged to `JoinTableRef` (`how` as its `op`).

    This BDF's query becomes the inline view `left_tbl` and `other`'s query
    becomes `right_tbl`.  The result selects every column of both.
    """
    lhs = InlineView(self._query_ast.to_sql(), 'left_tbl')
    rhs = InlineView(other._query_ast.to_sql(), 'right_tbl')

    # SELECT left.*, right.*
    star_items = [
        SelectItem(table_name=TableName(view.name)) for view in (lhs, rhs)]
    joined = JoinTableRef(lhs, rhs, on=on, op=how, hint=hint)
    return BigDataFrame(self._ic, SelectStmt(star_items, joined))
def take(self, n):
    """Return `n` rows as a pandas `DataFrame`

    Distributed and no notion of order, so not guaranteed to be
    reproducible.

    The current query is wrapped as a randomly-aliased inline view, and all
    of its columns are selected with a LIMIT of `n`.
    """
    view_alias = _random_id('inline_', 4)
    source = InlineView(self._query_ast.to_sql(), view_alias)

    # SELECT alias.* ... LIMIT n
    all_columns = [SelectItem(table_name=TableName(source.name))]
    limited = SelectStmt(
        all_columns, source, limit=LimitElement(Literal(n), None))

    rows = BigDataFrame(self._ic, limited).__iter__()
    return as_pandas(rows)
def group_by(self, by):
    """Group the BDF

    `by` is `string`, `Expr`, or `list/tuple[string/Expr]`.  A lone value
    is treated as a one-element grouping; strings are wrapped in `Literal`
    while `Expr` objects are used as given.

    Raises `ValueError` if any grouping element is neither a string nor an
    `Expr`.

    Returns a `GroupBy` built on this BDF's connection.
    """
    if isinstance(by, (tuple, list)):
        keys = by
    else:
        keys = (by, )

    for key in keys:
        if not isinstance(key, (basestring, Expr)):
            raise ValueError("must supply only strings or Exprs")

    group_exprs = tuple(
        key if isinstance(key, Expr) else Literal(key) for key in keys)

    source = InlineView(self._query_ast.to_sql(), 'inner_tbl')
    # invalid AST (empty select list); to be used by GroupBy
    partial_stmt = SelectStmt([], source, group_by=group_exprs)
    return GroupBy(self._ic, partial_stmt)