def reencode_dictionaries(table: pa.Table) -> pa.Table:
    """Return ``table`` with every dictionary-typed column re-encoded.

    Each column whose Arrow type is a dictionary type is replaced, in place
    position-wise and under the same name, by the result of
    ``reencode_dictionary_array``. All other columns are left untouched.

    NOTE(review): only ``chunks[0]`` of a column is handed to the helper, so
    this presumably expects single-chunk columns -- confirm against callers.
    """
    result = table
    for position, name in enumerate(table.column_names):
        chunked = result.column(position)
        if not pa.types.is_dictionary(chunked.type):
            continue
        reencoded = reencode_dictionary_array(chunked.chunks[0])
        result = result.set_column(position, name, reencoded)
    return result
def render_arrow_v1(table: pa.Table, params, **kwargs):
    """Replace the columns named in ``params["colnames"]`` with formatted ones.

    Every column whose name is listed is passed, together with its schema
    field, to ``format_chunked_array``; the returned value takes the column's
    place under the same name. Columns that are not listed are kept as-is.
    The resulting table is wrapped in an ``ArrowRenderResult``.
    """
    selected = frozenset(params["colnames"])
    for position, name in enumerate(table.column_names):
        if name in selected:
            formatted = format_chunked_array(
                table.column(position), table.schema.field(position)
            )
            table = table.set_column(position, name, formatted)
    return ArrowRenderResult(table)
def _cast_classlabels(self, pa_table: pa.Table) -> pa.Table:
    """Encode ClassLabel columns and rebuild the table in schema order.

    When ``self.config.features`` is empty or unset the table is returned
    unchanged. Otherwise every column whose feature is a
    ``datasets.ClassLabel`` is replaced by the output of the feature's
    ``str2int``, and the table is then rebuilt from its columns, picked by
    name in features order, using ``self.config.schema``.

    Args:
        pa_table: Table parsed from the JSON data.

    Returns:
        The table with ClassLabel columns encoded and laid out per the schema.
    """
    features = self.config.features
    if features:
        # Encode column if ClassLabel
        for col, feature in features.items():
            if not isinstance(feature, datasets.ClassLabel):
                continue
            encoded = feature.str2int(pa_table[col])
            # Locate the column by name: the JSON field order may differ from
            # the features order, so the position of ``col`` in ``features``
            # is not a reliable index into ``pa_table``.
            position = pa_table.column_names.index(col)
            pa_table = pa_table.set_column(
                position, self.config.schema.field(col), [encoded]
            )
        # Cast allows str <-> int/float
        # Before casting, rearrange JSON field names to match passed features
        # schema field names order
        pa_table = pa.Table.from_arrays(
            [pa_table[name] for name in features], schema=self.config.schema
        )
    return pa_table
def _cast_classlabels(self, pa_table: pa.Table) -> pa.Table:
    """Encode string ClassLabel columns, then cast the table to the schema.

    When ``self.config.features`` is empty or unset the table is returned
    unchanged. For each feature that is a ``datasets.ClassLabel``:

    * a string column is replaced by the output of the feature's ``str2int``;
    * a column whose type already equals the schema field's type is kept;
    * any other column type is rejected.

    Finally the table is passed through ``table_cast`` with
    ``self.config.schema``.

    Args:
        pa_table: Table parsed from the JSON data.

    Returns:
        The table after ClassLabel encoding and ``table_cast``.

    Raises:
        ValueError: If a ClassLabel column is neither a string column nor of
            the type declared for it in the schema.
    """
    features = self.config.features
    if features:
        # Encode column if ClassLabel
        for col, feature in features.items():
            if not isinstance(feature, datasets.ClassLabel):
                continue
            column = pa_table[col]
            if column.type == pa.string():
                # Locate the column by name: the JSON field order may differ
                # from the features order, so the position of ``col`` in
                # ``features`` is not a reliable index into ``pa_table``.
                position = pa_table.column_names.index(col)
                pa_table = pa_table.set_column(
                    position,
                    self.config.schema.field(col),
                    [feature.str2int(column)],
                )
            elif column.type != self.config.schema.field(col).type:
                raise ValueError(
                    f"Field '{col}' from the JSON data of type {pa_table[col].type} is not compatible with ClassLabel. Compatible types are int64 and string."
                )
        # Cast allows str <-> int/float or str to Audio for example
        pa_table = table_cast(pa_table, self.config.schema)
    return pa_table
def _cast_classlabels(self, pa_table: pa.Table) -> pa.Table:
    """Encode string ClassLabel columns and rebuild the table in schema order.

    When ``self.config.features`` is empty or unset the table is returned
    unchanged. For each feature that is a ``datasets.ClassLabel``:

    * a string column is replaced by the output of the feature's ``str2int``;
    * a column whose type already equals the schema field's type is kept;
    * any other column type is rejected.

    The table is then rebuilt from its columns, picked by name in features
    order, using ``self.config.schema``.

    Args:
        pa_table: Table parsed from the JSON data.

    Returns:
        The table with ClassLabel columns encoded and laid out per the schema.

    Raises:
        ValueError: If a ClassLabel column is neither a string column nor of
            the type declared for it in the schema.
    """
    features = self.config.features
    if features:
        # Encode column if ClassLabel
        for col, feature in features.items():
            if not isinstance(feature, datasets.ClassLabel):
                continue
            column = pa_table[col]
            if column.type == pa.string():
                # Locate the column by name: the JSON field order may differ
                # from the features order, so the position of ``col`` in
                # ``features`` is not a reliable index into ``pa_table``.
                position = pa_table.column_names.index(col)
                pa_table = pa_table.set_column(
                    position,
                    self.config.schema.field(col),
                    [feature.str2int(column)],
                )
            elif column.type != self.config.schema.field(col).type:
                raise ValueError(
                    f"Field '{col}' from the JSON data of type {pa_table[col].type} is not compatible with ClassLabel. Compatible types are int64 and string."
                )
        # Cast allows str <-> int/float
        # Before casting, rearrange JSON field names to match passed features
        # schema field names order
        pa_table = pa.Table.from_arrays(
            [pa_table[name] for name in features], schema=self.config.schema
        )
    return pa_table
def _render_startof(table: pa.Table, colnames: List[str],
                    unit: str) -> ArrowRenderResult:
    """Apply ``_startof`` to each named column and wrap the resulting table.

    Every column in ``colnames`` is looked up by name, run through
    ``_startof`` with ``unit``, and replaced by the ``column`` attribute of
    the result. If any of those results reports ``truncated``, a single
    out-of-bounds warning is attached to the returned ``ArrowRenderResult``;
    otherwise the error list is empty.
    """
    any_truncated = False
    for name in colnames:
        position = table.column_names.index(name)
        outcome = _startof(table.columns[position], unit)
        table = table.set_column(position, name, outcome.column)
        any_truncated = any_truncated or bool(outcome.truncated)

    if not any_truncated:
        return ArrowRenderResult(table, errors=[])

    # One warning covers all affected columns, however many were truncated.
    warning = RenderError(
        trans(
            "warning.convertedOutOfBoundsToNull",
            "Converted timestamp {timestamp} to null because it is out of bounds.",
            {"timestamp": _out_of_bounds_timestamp(unit)},
        )
    )
    return ArrowRenderResult(table, errors=[warning])