def from_hdfs(ic, path, schema, table=None, overwrite=False,
              file_format='TEXTFILE', partition_schema=None,
              field_terminator='\t', line_terminator='\n',
              escape_char='\\'):
    """Create a BDF backed by an external file in HDFS.

    File must be Impala-compatible
    """
    # Partitioned external tables are not supported yet; fail fast.
    if partition_schema is not None:
        raise NotImplementedError(
            "Partitions not yet implemented in .from_hdfs()")
    # No table name given: synthesize one in the context's temp database.
    if table is None:
        tmp_name = _random_id('tmp_table_', 8)
        table = "%s.%s" % (ic._temp_db, tmp_name)
    qualified = _to_TableName(table)
    if overwrite:
        # Clear any previous table of the same name before (re)creating.
        ic._cursor.execute("DROP TABLE IF EXISTS %s" % qualified.to_sql())
    ddl = _create_table(qualified, schema, path=path,
                        file_format=file_format,
                        field_terminator=field_terminator,
                        line_terminator=line_terminator,
                        escape_char=escape_char)
    ic._cursor.execute(ddl)
    return from_sql_table(ic, qualified.to_sql())
def from_pandas(ic, df, table=None, path=None, method='in_query',
                file_format='TEXTFILE', field_terminator='\t',
                line_terminator='\n', escape_char='\\', overwrite=False):
    """Create a BDF by shipping an in-memory pandas `DataFrame` into Impala

    path is the dir, not the filename

    Parameters
    ----------
    ic : ImpalaContext
    df : pandas.DataFrame
        Source data; column dtypes are mapped to Impala primitive types.
    table : str, optional
        Fully qualified table name; a temp-db name is generated if None.
    path : str, optional
        HDFS *directory* backing the table; defaults into the context's
        temp dir when None.
    method : {'in_query', 'webhdfs'}
        'in_query' inlines rows into an INSERT statement; 'webhdfs'
        uploads a CSV rendering of the frame via the HDFS client.
    overwrite : bool
        Drop any existing table (and, for webhdfs, overwrite the data file).
    """
    # TODO: this is not atomic
    assert isinstance(ic, ImpalaContext)
    temp_table = _random_id('tmp_table_', 8)
    if table is None:
        table = "%s.%s" % (ic._temp_db, temp_table)
    if path is None:
        path = os.path.join(ic._temp_dir, temp_table)
    table_name = _to_TableName(table)
    if overwrite:
        ic._cursor.execute("DROP TABLE IF EXISTS %s" % table_name.to_sql())
    columns = list(df.columns)
    types = [_numpy_dtype_to_impala_PrimitiveType(ty) for ty in df.dtypes]
    # Materialize the pairs: on Python 3, zip() is a one-shot iterator and
    # would be silently empty if _create_table iterated it more than once.
    schema = list(zip(columns, types))
    create_stmt = _create_table(table_name, schema, path=path,
                                file_format=file_format,
                                field_terminator=field_terminator,
                                line_terminator=line_terminator,
                                escape_char=escape_char)
    ic._cursor.execute(create_stmt)
    if method == 'in_query':
        # Skip the INSERT for an empty frame: "INSERT INTO t VALUES " with
        # no tuples is invalid SQL and would raise on execute.
        if len(df.values) > 0:
            query = "INSERT INTO %s VALUES " % table_name.to_sql()
            query += ', '.join(['(%s)' % ', '.join(map(_py_to_sql_string, row))
                                for row in df.values])
            ic._cursor.execute(query)
    elif method == 'webhdfs':
        if file_format != 'TEXTFILE':
            raise ValueError("only TEXTFILE format supported for webhdfs")
        # NOTE(review): unreachable — `path` was defaulted above whenever it
        # was None, so this guard can never fire; kept for interface parity.
        if path is None:
            raise ValueError(
                "must supply a path for EXTERNAL table for webhdfs")
        hdfs_client = ic.hdfs_client()
        raw_data = StringIO()
        df.to_csv(raw_data, sep=field_terminator,
                  line_terminator=line_terminator,
                  quoting=csv.QUOTE_NONE, escapechar=escape_char,
                  header=False, index=False)
        hdfs_client.write(
            os.path.join(path, 'data.txt'), raw_data.getvalue(),
            overwrite=overwrite)
        raw_data.close()
    else:
        raise ValueError(
            "method must be 'in_query' or 'webhdfs'; got %s" % method)
    return from_sql_table(ic, table_name.to_sql())
def from_hdfs(ic, path, schema, table=None, overwrite=False,
              file_format='TEXTFILE', partition_schema=None,
              field_terminator='\t', line_terminator='\n',
              escape_char='\\'):
    """Create a BDF backed by an external file in HDFS.

    File must be Impala-compatible

    NOTE(review): this duplicates an identical ``from_hdfs`` defined
    earlier in the file; this later definition is the one that wins.
    """
    if partition_schema is not None:
        raise NotImplementedError(
            "Partitions not yet implemented in .from_hdfs()")
    if table is None:
        # Fabricate a unique name inside the context's temporary database.
        table = "%s.%s" % (ic._temp_db, _random_id('tmp_table_', 8))
    resolved = _to_TableName(table)
    sql_name = resolved.to_sql()
    if overwrite:
        ic._cursor.execute("DROP TABLE IF EXISTS %s" % sql_name)
    ic._cursor.execute(
        _create_table(resolved, schema, path=path,
                      file_format=file_format,
                      field_terminator=field_terminator,
                      line_terminator=line_terminator,
                      escape_char=escape_char))
    return from_sql_table(ic, sql_name)