def from_hdfs(ic, path, schema, table=None, overwrite=False,
              file_format='TEXTFILE', partition_schema=None,
              field_terminator='\t', line_terminator='\n', escape_char='\\'):
    """Create a BDF backed by an external file in HDFS.

    File must be Impala-compatible.
    """
    if partition_schema is not None:
        raise NotImplementedError(
            "Partitions not yet implemented in .from_hdfs()")
    # No explicit table name: mint a temp table in the context's temp db.
    if table is None:
        table = "%s.%s" % (ic._temp_db, _random_id('tmp_table_', 8))
    qualified = _to_TableName(table)
    target_sql = qualified.to_sql()
    if overwrite:
        ic._cursor.execute("DROP TABLE IF EXISTS %s" % target_sql)
    ddl = _create_table(
        qualified, schema, path=path, file_format=file_format,
        field_terminator=field_terminator, line_terminator=line_terminator,
        escape_char=escape_char)
    ic._cursor.execute(ddl)
    return from_sql_table(ic, target_sql)
def store(self, path=None, table=None, file_format='TEXTFILE',
          field_terminator='\t', line_terminator='\n', escape_char='\\',
          overwrite=False):
    """Materialize the results and store them in HDFS. Functions as an
    EXTERNAL table.

    Implemented through a `CREATE TABLE AS SELECT`.
    """
    # Single random id shared by the default table name and default path.
    tmp = _random_id('tmp_table_', 8)
    table = table if table is not None else "%s.%s" % (self._temp_db, tmp)
    path = path if path is not None else os.path.join(self._temp_dir, tmp)
    return self._store(
        path=path, table_name=_to_TableName(table), file_format=file_format,
        field_terminator=field_terminator, line_terminator=line_terminator,
        escape_char=escape_char, overwrite=overwrite)
def store(self, path=None, table=None, file_format='TEXTFILE',
          field_terminator='\t', line_terminator='\n', escape_char='\\',
          overwrite=False):
    """Materialize the results and store them in HDFS.

    Implemented through a `CREATE TABLE AS SELECT`.
    """
    tmp_name = _random_id('tmp_table_', 8)
    if table is None:
        table = "%s.%s" % (self._temp_db, tmp_name)
    if path is None:
        path = os.path.join(self._temp_dir, tmp_name)
    dest = _to_TableName(table)
    if overwrite:
        self._cursor.execute("DROP TABLE IF EXISTS %s" % dest.to_sql())
    # CTAS: prepend the CREATE TABLE clause to this BDF's SELECT.
    ctas_prefix = _create_table_as_select(
        dest, path=path, file_format=file_format,
        field_terminator=field_terminator, line_terminator=line_terminator,
        escape_char=escape_char)
    self._cursor.execute(ctas_prefix + self.to_sql())
    return from_sql_table(self._ic, dest.to_sql())
def from_sql_table(ic, table):
    """Create a BDF from a table name usable in Impala"""
    ref = BaseTableRef(_to_TableName(table))
    schema = _get_table_schema_hack(ic._cursor, ref.to_sql())
    # Build one SELECT item per column; types are not needed here.
    select_list = tuple(SelectItem(expr=Literal(col_name))
                        for (col_name, _col_type) in schema)
    return BigDataFrame(ic, SelectStmt(select_list, ref))
def from_sql_table(ic, table):
    """Create a BDF from a table name usable in Impala"""
    table_name = _to_TableName(table)
    base_ref = BaseTableRef(table_name)
    base_sql = base_ref.to_sql()
    schema = _get_table_schema_hack(ic._cursor, base_sql)
    items = []
    for (col, ty) in schema:
        # Only the column name matters for the projection list.
        items.append(SelectItem(expr=Literal(col)))
    return BigDataFrame(ic, SelectStmt(tuple(items), base_ref))
def store_managed(self, table, file_format='PARQUET', field_terminator='\t',
                  line_terminator='\n', escape_char='\\', overwrite=False):
    """Materialize the results and store them in HDFS as an Impala-managed
    table.

    Implemented through a `CREATE TABLE AS SELECT`.
    """
    # path=None signals a managed (non-EXTERNAL) table to _store.
    return self._store(
        path=None,
        table_name=_to_TableName(table),
        file_format=file_format,
        field_terminator=field_terminator,
        line_terminator=line_terminator,
        escape_char=escape_char,
        overwrite=overwrite)
def save_view(self, name, overwrite=False):
    """Create a named view representing this BDF for later reference"""
    # TODO: is this fn useful?
    view_name = _to_TableName(name)
    view_sql = view_name.to_sql()
    cursor = self._ic._cursor
    if overwrite:
        cursor.execute('DROP VIEW IF EXISTS %s' % view_sql)
    cursor.execute(
        'CREATE VIEW %s AS %s' % (view_sql, self._query_ast.to_sql()))
    return from_sql_table(self._ic, view_sql)
def from_pandas(ic, df, table=None, path=None, method='in_query',
                file_format='TEXTFILE', field_terminator='\t',
                line_terminator='\n', escape_char='\\', overwrite=False):
    """Create a BDF by shipping an in-memory pandas `DataFrame` into Impala

    path is the dir, not the filename
    """
    # TODO: this is not atomic
    assert isinstance(ic, ImpalaContext)
    # Validate arguments up front so a bad `method`/`file_format` combination
    # fails BEFORE any DROP/CREATE runs; previously these checks fired only
    # after the table had already been created, leaving an empty table behind.
    if method not in ('in_query', 'webhdfs'):
        raise ValueError(
            "method must be 'in_query' or 'webhdfs'; got %s" % method)
    if method == 'webhdfs' and file_format != 'TEXTFILE':
        raise ValueError("only TEXTFILE format supported for webhdfs")
    temp_table = _random_id('tmp_table_', 8)
    if table is None:
        table = "%s.%s" % (ic._temp_db, temp_table)
    if path is None:
        path = os.path.join(ic._temp_dir, temp_table)
    table_name = _to_TableName(table)
    if overwrite:
        ic._cursor.execute("DROP TABLE IF EXISTS %s" % table_name.to_sql())
    columns = list(df.columns)
    types = [_numpy_dtype_to_impala_PrimitiveType(ty) for ty in df.dtypes]
    # Materialize as a list: on Python 3 a bare zip() is a one-shot iterator
    # and would be exhausted if _create_table iterates it more than once.
    schema = list(zip(columns, types))
    create_stmt = _create_table(table_name, schema, path=path,
                                file_format=file_format,
                                field_terminator=field_terminator,
                                line_terminator=line_terminator,
                                escape_char=escape_char)
    ic._cursor.execute(create_stmt)
    if method == 'in_query':
        # Ship the data inline as a single multi-row INSERT statement.
        query = "INSERT INTO %s VALUES " % table_name.to_sql()
        query += ', '.join(['(%s)' % ', '.join(map(_py_to_sql_string, row))
                            for row in df.values])
        ic._cursor.execute(query)
    else:  # method == 'webhdfs' (validated above)
        # NOTE: `path` is always non-None here (defaulted above), so the
        # original "must supply a path" check was unreachable and is dropped.
        hdfs_client = ic.hdfs_client()
        raw_data = StringIO()
        df.to_csv(raw_data, sep=field_terminator,
                  line_terminator=line_terminator, quoting=csv.QUOTE_NONE,
                  escapechar=escape_char, header=False, index=False)
        hdfs_client.write(os.path.join(path, 'data.txt'),
                          raw_data.getvalue(), overwrite=overwrite)
        raw_data.close()
    return from_sql_table(ic, table_name.to_sql())
def store(self, path=None, table=None, file_format='TEXTFILE',
          field_terminator='\t', line_terminator='\n', escape_char='\\',
          overwrite=False):
    """Materialize the results and store them in HDFS. Functions as an
    EXTERNAL table.

    Implemented through a `CREATE TABLE AS SELECT`.
    """
    # The same generated id backs both defaults, so the default table and
    # the default storage directory stay paired.
    generated = _random_id('tmp_table_', 8)
    if table is None:
        table = "%s.%s" % (self._temp_db, generated)
    if path is None:
        path = os.path.join(self._temp_dir, generated)
    destination = _to_TableName(table)
    return self._store(path=path,
                       table_name=destination,
                       file_format=file_format,
                       field_terminator=field_terminator,
                       line_terminator=line_terminator,
                       escape_char=escape_char,
                       overwrite=overwrite)
def from_hdfs(ic, path, schema, table=None, overwrite=False,
              file_format='TEXTFILE', partition_schema=None,
              field_terminator='\t', line_terminator='\n', escape_char='\\'):
    """Create a BDF backed by an external file in HDFS.

    File must be Impala-compatible.
    """
    # Guard clause: partitioned external tables are not supported yet.
    if partition_schema is not None:
        raise NotImplementedError(
            "Partitions not yet implemented in .from_hdfs()")
    if table is None:
        temp_table = _random_id('tmp_table_', 8)
        table = "%s.%s" % (ic._temp_db, temp_table)
    table_name = _to_TableName(table)
    cursor = ic._cursor
    if overwrite:
        cursor.execute("DROP TABLE IF EXISTS %s" % table_name.to_sql())
    cursor.execute(_create_table(table_name, schema,
                                 path=path,
                                 file_format=file_format,
                                 field_terminator=field_terminator,
                                 line_terminator=line_terminator,
                                 escape_char=escape_char))
    return from_sql_table(ic, table_name.to_sql())
def store(self, path=None, table=None, file_format='TEXTFILE',
          field_terminator='\t', line_terminator='\n', escape_char='\\',
          overwrite=False):
    """Materialize the results and store them in HDFS.

    Implemented through a `CREATE TABLE AS SELECT`.
    """
    generated_id = _random_id('tmp_table_', 8)
    if table is None:
        table = "%s.%s" % (self._temp_db, generated_id)
    if path is None:
        path = os.path.join(self._temp_dir, generated_id)
    target = _to_TableName(table)
    target_sql = target.to_sql()
    if overwrite:
        self._cursor.execute("DROP TABLE IF EXISTS %s" % target_sql)
    # CREATE TABLE ... AS <this BDF's SELECT statement>
    prefix = _create_table_as_select(target,
                                     path=path,
                                     file_format=file_format,
                                     field_terminator=field_terminator,
                                     line_terminator=line_terminator,
                                     escape_char=escape_char)
    self._cursor.execute(prefix + self.to_sql())
    return from_sql_table(self._ic, target_sql)