Exemplo n.º 1
0
def from_hdfs(ic,
              path,
              schema,
              table=None,
              overwrite=False,
              file_format='TEXTFILE',
              partition_schema=None,
              field_terminator='\t',
              line_terminator='\n',
              escape_char='\\'):
    """Create a BDF backed by an external file in HDFS.

    The data at ``path`` must already be Impala-compatible.

    Parameters
    ----------
    ic : context object exposing ``_cursor`` and ``_temp_db``.
    path : forwarded to ``_create_table`` as ``path``.
    schema : forwarded to ``_create_table`` unchanged.
    table : table name; when ``None`` a random ``tmp_table_*`` name
        inside ``ic._temp_db`` is generated.
    overwrite : when true, ``DROP TABLE IF EXISTS`` is issued for the
        target table before it is created.
    file_format, field_terminator, line_terminator, escape_char :
        forwarded to ``_create_table`` as keyword arguments.
    partition_schema : must be ``None``; partitions are unsupported.

    Returns
    -------
    Whatever ``from_sql_table`` returns for the created table.

    Raises
    ------
    NotImplementedError
        If ``partition_schema`` is given.
    """
    if partition_schema is not None:
        raise NotImplementedError(
            "Partitions not yet implemented in .from_hdfs()")

    # No explicit name: fall back to a throwaway table in the temp db.
    if table is None:
        generated = _random_id('tmp_table_', 8)
        table = "%s.%s" % (ic._temp_db, generated)
    target = _to_TableName(table)

    # Storage options handed to the DDL builder as keywords.
    storage_opts = dict(path=path,
                        file_format=file_format,
                        field_terminator=field_terminator,
                        line_terminator=line_terminator,
                        escape_char=escape_char)

    if overwrite:
        ic._cursor.execute("DROP TABLE IF EXISTS %s" % target.to_sql())

    ddl = _create_table(target, schema, **storage_opts)
    ic._cursor.execute(ddl)
    return from_sql_table(ic, target.to_sql())
Exemplo n.º 2
0
def from_pandas(ic, df, table=None, path=None, method='in_query',
                file_format='TEXTFILE', field_terminator='\t',
                line_terminator='\n', escape_char='\\', overwrite=False):
    """Create a BDF by shipping an in-memory pandas `DataFrame` into Impala.

    A table matching the frame's columns is created, then populated either
    with a single ``INSERT ... VALUES`` statement (``method='in_query'``) or
    by uploading a delimited text file through ``ic.hdfs_client()``
    (``method='webhdfs'``).

    Parameters
    ----------
    ic : ImpalaContext
    df : pandas DataFrame to ship.
    table : table name; when ``None`` a random ``tmp_table_*`` name inside
        ``ic._temp_db`` is used.
    path : the *directory* for the table data, not a filename.  Defaults to
        a random sub-directory of ``ic._temp_dir``.  With ``webhdfs`` the
        data is written to ``<path>/data.txt``.
    method : ``'in_query'`` or ``'webhdfs'``.
    file_format, field_terminator, line_terminator, escape_char :
        forwarded to ``_create_table``; the terminators and escape char are
        also used when serialising the frame for ``webhdfs``.
    overwrite : when true, drop an existing table first and allow the
        HDFS client to overwrite an existing ``data.txt``.

    Returns
    -------
    Whatever ``from_sql_table`` returns for the created table.

    Raises
    ------
    ValueError
        If ``method`` is unknown, or ``webhdfs`` is combined with a
        ``file_format`` other than ``'TEXTFILE'``.  Both are checked
        before any statement is executed, so a bad argument does not
        leave an empty table behind.
    """
    # Validate the cheap arguments up front, before any DDL is executed.
    if method not in ('in_query', 'webhdfs'):
        raise ValueError(
            "method must be 'in_query' or 'webhdfs'; got %s" % method)
    if method == 'webhdfs' and file_format != 'TEXTFILE':
        raise ValueError("only TEXTFILE format supported for webhdfs")

    # TODO: this is not atomic
    # NOTE(review): `assert` is stripped under `python -O`; kept as-is so
    # the exception type seen by existing callers does not change.
    assert isinstance(ic, ImpalaContext)
    temp_table = _random_id('tmp_table_', 8)
    if table is None:
        table = "%s.%s" % (ic._temp_db, temp_table)
    if path is None:
        # From here on `path` is never None.
        path = os.path.join(ic._temp_dir, temp_table)
    table_name = _to_TableName(table)
    if overwrite:
        ic._cursor.execute("DROP TABLE IF EXISTS %s" % table_name.to_sql())
    columns = list(df.columns)
    types = [_numpy_dtype_to_impala_PrimitiveType(ty) for ty in df.dtypes]
    # Materialise the pairs: on Python 3 `zip` is a one-shot iterator.
    schema = list(zip(columns, types))
    create_stmt = _create_table(table_name, schema, path=path,
                                file_format=file_format,
                                field_terminator=field_terminator,
                                line_terminator=line_terminator,
                                escape_char=escape_char)
    ic._cursor.execute(create_stmt)
    if method == 'in_query':
        tuples = ['(%s)' % ', '.join(map(_py_to_sql_string, row))
                  for row in df.values]
        # An empty frame would produce "INSERT ... VALUES " with no
        # tuples, which is not valid SQL; the empty table is the result.
        if tuples:
            query = "INSERT INTO %s VALUES %s" % (table_name.to_sql(),
                                                  ', '.join(tuples))
            ic._cursor.execute(query)
    else:  # method == 'webhdfs' (validated above)
        hdfs_client = ic.hdfs_client()
        raw_data = StringIO()
        try:
            # NOTE(review): pandas 1.5 renamed `line_terminator` to
            # `lineterminator` and 2.0 removed the old spelling; this call
            # targets the older pandas API.
            df.to_csv(raw_data, sep=field_terminator,
                      line_terminator=line_terminator,
                      quoting=csv.QUOTE_NONE,
                      escapechar=escape_char, header=False, index=False)
            hdfs_client.write(
                os.path.join(path, 'data.txt'), raw_data.getvalue(),
                overwrite=overwrite)
        finally:
            # Release the buffer even if serialisation or upload fails.
            raw_data.close()
    return from_sql_table(ic, table_name.to_sql())
Exemplo n.º 3
0
def from_hdfs(ic, path, schema, table=None, overwrite=False,
        file_format='TEXTFILE', partition_schema=None,
        field_terminator='\t', line_terminator='\n', escape_char='\\'):
    """Create a BDF backed by an external file in HDFS.

    The file must be Impala-compatible.  When ``table`` is omitted a random
    ``tmp_table_*`` name inside ``ic._temp_db`` is used.  With ``overwrite``
    set, a ``DROP TABLE IF EXISTS`` is issued before the table is created.
    The remaining keyword arguments are forwarded to ``_create_table``.

    Returns the result of ``from_sql_table`` for the created table.
    Raises ``NotImplementedError`` when ``partition_schema`` is supplied.
    """
    partitioned = partition_schema is not None
    if partitioned:
        raise NotImplementedError("Partitions not yet implemented in .from_hdfs()")

    # Resolve the table to create: caller-supplied or a temp name.
    if table is None:
        suffix = _random_id('tmp_table_', 8)
        qualified = "%s.%s" % (ic._temp_db, suffix)
    else:
        qualified = table
    name = _to_TableName(qualified)

    if overwrite:
        drop_sql = "DROP TABLE IF EXISTS %s" % name.to_sql()
        ic._cursor.execute(drop_sql)

    ddl = _create_table(
        name,
        schema,
        path=path,
        file_format=file_format,
        field_terminator=field_terminator,
        line_terminator=line_terminator,
        escape_char=escape_char,
    )
    ic._cursor.execute(ddl)

    return from_sql_table(ic, name.to_sql())