Пример #1
0
def read_table(source, columns=None, memory_map=True):
    """
    Read a pyarrow.Table from Feather format

    Parameters
    ----------
    source : str file path, or file-like object
    columns : sequence, optional
        Only read a specific set of columns, given either all as ``int``
        indices or all as ``str`` names. If not provided, all columns are
        read. For a selection, the returned table follows the order (and
        repetition) of the requested columns.
    memory_map : boolean, default True
        Use memory mapping when opening file on disk

    Returns
    -------
    table : pyarrow.Table

    Raises
    ------
    TypeError
        If ``columns`` is not made up entirely of ``int`` values or entirely
        of ``str`` values.
    """
    reader = ext.FeatherReader()
    reader.open(source, use_memory_map=memory_map)

    if columns is None:
        return reader.read()

    # Materialize the selection once. It is iterated several times below, and
    # the "already sorted and unique" check compares against a list, which
    # would never match a tuple or other non-list sequence.
    columns = list(columns)

    # Exact type comparison (rather than isinstance) is kept from the
    # original, so e.g. bool values are not accepted as integer indices.
    column_types = [type(column) for column in columns]
    select_by_index = all(t is int for t in column_types)
    if select_by_index:
        table = reader.read_indices(columns)
    elif all(t is str for t in column_types):
        table = reader.read_names(columns)
    else:
        column_type_names = [t.__name__ for t in column_types]
        raise TypeError("Columns must be indices or names. "
                        "Got columns {} of types {}"
                        .format(columns, column_type_names))

    # Feather v1 already respects the column selection
    if reader.version < 3:
        return table

    # Feather v2 reads with sorted / deduplicated selection
    unique_sorted = sorted(set(columns))
    if unique_sorted == columns:
        return table

    # Follow the exact order / selection that was requested.
    if select_by_index:
        # The table that was read only holds the selected columns, so a file
        # index must be translated to its position within that reduced table
        # (this relies on the sorted / de-duplicated read order noted above).
        # Using the file index directly fails for selections such as [2, 0].
        position_of = {index: pos for pos, index in enumerate(unique_sorted)}
        keys = [position_of[index] for index in columns]
    else:
        # Names can be looked up in the table directly.
        keys = columns

    new_fields = [table.schema.field(key) for key in keys]
    new_schema = schema(new_fields, metadata=table.schema.metadata)
    new_columns = [table.column(key) for key in keys]
    return Table.from_arrays(new_columns, schema=new_schema)
Пример #2
0
    def read(self, columns=None, nthreads=1):
        """
        Read the file's columns and convert them with ``Table.to_pandas``.

        Parameters
        ----------
        columns : iterable of column names, optional
            Names of the columns to keep. Names that do not occur in the file
            are ignored, and the result follows the file's column order
            rather than the order of this argument. If None, every column is
            read.
        nthreads : int, default 1
            Forwarded to ``Table.to_pandas``.

        Returns
        -------
        The result of ``Table.to_pandas`` on the selected columns.
        """
        wanted = None if columns is None else set(columns)

        kept_names = []
        kept_arrays = []
        for index in range(self.num_columns):
            column_name = self.get_column_name(index)
            # Skip columns outside the requested set, if one was given.
            if wanted is not None and column_name not in wanted:
                continue
            kept_arrays.append(self.get_column(index))
            kept_names.append(column_name)

        selected = Table.from_arrays(kept_arrays, names=kept_names)
        return selected.to_pandas(nthreads=nthreads)
Пример #3
0
    def read_table(self, columns=None):
        """
        Read the file's columns into a Table built with ``Table.from_arrays``.

        Parameters
        ----------
        columns : iterable of column names, optional
            Names of the columns to keep. Names that do not occur in the file
            are ignored, and the result follows the file's column order
            rather than the order of this argument. If None, every column is
            read.

        Returns
        -------
        The Table assembled from the selected columns and their names.
        """
        keep_all = columns is None
        requested = None if keep_all else set(columns)

        # (name, column) pairs, in file order.
        picked = []
        for position in range(self.num_columns):
            field_name = self.get_column_name(position)
            if keep_all or field_name in requested:
                picked.append((field_name, self.get_column(position)))

        return Table.from_arrays(
            [column for _, column in picked],
            names=[field_name for field_name, _ in picked],
        )