예제 #1
0
def reencode_dictionaries(table: pa.Table) -> pa.Table:
    """Return ``table`` with its dictionary-typed columns re-encoded.

    Columns whose type is not a dictionary type are left untouched. Each
    dictionary column is replaced, in place and under the same name, by the
    result of ``reencode_dictionary_array``.

    Only the first chunk (``chunks[0]``) of a dictionary column is handed to
    the helper.
    NOTE(review): presumably callers guarantee single-chunk columns -- confirm.
    """
    result = table
    for position, name in enumerate(table.column_names):
        chunked = result.column(position)
        if not pa.types.is_dictionary(chunked.type):
            continue
        reencoded = reencode_dictionary_array(chunked.chunks[0])
        result = result.set_column(position, name, reencoded)
    return result
예제 #2
0
def render_arrow_v1(table: pa.Table, params, **kwargs):
    """Format the columns selected in ``params["colnames"]``.

    Each column of ``table`` whose name appears in ``params["colnames"]`` is
    replaced, at the same position and under the same name, by the output of
    ``format_chunked_array`` (called with the column and its schema field).
    All other columns are kept as they are. Extra keyword arguments are
    accepted and ignored.

    Returns:
        An ``ArrowRenderResult`` wrapping the resulting table.
    """
    selected = frozenset(params["colnames"])

    for position, name in enumerate(table.column_names):
        if name in selected:
            formatted = format_chunked_array(
                table.column(position), table.schema.field(position))
            table = table.set_column(position, name, formatted)

    return ArrowRenderResult(table)
예제 #3
0
 def _cast_classlabels(self, pa_table: pa.Table) -> pa.Table:
     """Encode ClassLabel columns and conform the table to the configured schema.

     When ``self.config.features`` is falsy the table is returned unchanged.
     Otherwise every feature that is a ``datasets.ClassLabel`` has its column
     replaced by the output of that feature's ``str2int``, and the table is
     then rebuilt with its columns in features order under
     ``self.config.schema``.

     Args:
         pa_table: Table built from the JSON data.

     Returns:
         The converted table.
     """
     features = self.config.features
     if not features:
         return pa_table

     # Encode column if ClassLabel
     for col in features:
         feature = features[col]
         if not isinstance(feature, datasets.ClassLabel):
             continue
         encoded = feature.str2int(pa_table[col])
         # Locate the column by name in the table itself: the JSON column
         # order is not guaranteed to match the features order (it is only
         # rearranged below), so the position of ``col`` within the features
         # could point at a different column.
         position = pa_table.column_names.index(col)
         pa_table = pa_table.set_column(
             position, self.config.schema.field(col), [encoded])

     # Cast allows str <-> int/float
     # Before casting, rearrange JSON field names to match passed features schema field names order
     return pa.Table.from_arrays(
         [pa_table[name] for name in features],
         schema=self.config.schema)
예제 #4
0
 def _cast_classlabels(self, pa_table: pa.Table) -> pa.Table:
     """Encode string ClassLabel columns and cast the table to the configured schema.

     When ``self.config.features`` is falsy the table is returned unchanged.
     Otherwise, for every feature that is a ``datasets.ClassLabel``:

     * a string column is replaced by the output of the feature's ``str2int``;
     * a column whose type already equals the schema field's type is kept;
     * any other column type is rejected.

     The table is finally passed through ``table_cast`` with
     ``self.config.schema``.

     Args:
         pa_table: Table built from the JSON data.

     Returns:
         The converted table.

     Raises:
         ValueError: If a ClassLabel column is neither a string column nor
             already of the schema field's type.
     """
     features = self.config.features
     if not features:
         return pa_table

     # Encode column if ClassLabel
     for col in features:
         feature = features[col]
         if not isinstance(feature, datasets.ClassLabel):
             continue
         if pa_table[col].type == pa.string():
             # Locate the column by name in the table itself: the JSON column
             # order is not guaranteed to match the features order, so the
             # position of ``col`` within the features could point at a
             # different column.
             position = pa_table.column_names.index(col)
             pa_table = pa_table.set_column(
                 position, self.config.schema.field(col),
                 [feature.str2int(pa_table[col])])
         elif pa_table[col].type != self.config.schema.field(col).type:
             raise ValueError(
                 f"Field '{col}' from the JSON data of type {pa_table[col].type} is not compatible with ClassLabel. Compatible types are int64 and string."
             )

     # Cast allows str <-> int/float or str to Audio for example
     return table_cast(pa_table, self.config.schema)
예제 #5
0
 def _cast_classlabels(self, pa_table: pa.Table) -> pa.Table:
     """Encode string ClassLabel columns and conform the table to the configured schema.

     When ``self.config.features`` is falsy the table is returned unchanged.
     Otherwise, for every feature that is a ``datasets.ClassLabel``:

     * a string column is replaced by the output of the feature's ``str2int``;
     * a column whose type already equals the schema field's type is kept;
     * any other column type is rejected.

     The table is then rebuilt with its columns in features order under
     ``self.config.schema``.

     Args:
         pa_table: Table built from the JSON data.

     Returns:
         The converted table.

     Raises:
         ValueError: If a ClassLabel column is neither a string column nor
             already of the schema field's type.
     """
     features = self.config.features
     if not features:
         return pa_table

     # Encode column if ClassLabel
     for col in features:
         feature = features[col]
         if not isinstance(feature, datasets.ClassLabel):
             continue
         if pa_table[col].type == pa.string():
             # Locate the column by name in the table itself: the JSON column
             # order is not guaranteed to match the features order (it is only
             # rearranged below), so the position of ``col`` within the
             # features could point at a different column.
             position = pa_table.column_names.index(col)
             pa_table = pa_table.set_column(
                 position, self.config.schema.field(col),
                 [feature.str2int(pa_table[col])])
         elif pa_table[col].type != self.config.schema.field(col).type:
             raise ValueError(
                 f"Field '{col}' from the JSON data of type {pa_table[col].type} is not compatible with ClassLabel. Compatible types are int64 and string."
             )

     # Cast allows str <-> int/float
     # Before casting, rearrange JSON field names to match passed features schema field names order
     return pa.Table.from_arrays(
         [pa_table[name] for name in features],
         schema=self.config.schema)
예제 #6
0
def _render_startof(table: pa.Table, colnames: List[str],
                    unit: str) -> ArrowRenderResult:
    """Apply ``_startof`` to the named columns and report truncation.

    Each column listed in ``colnames`` is looked up by name, passed to
    ``_startof`` together with ``unit``, and replaced in the table by the
    ``column`` attribute of the result.

    Returns:
        An ``ArrowRenderResult`` holding the updated table. Its ``errors``
        list carries a single out-of-bounds warning when at least one
        ``_startof`` result was flagged ``truncated``; otherwise it is empty.
    """
    any_truncated = False
    for name in colnames:
        position = table.column_names.index(name)
        outcome = _startof(table.columns[position], unit)
        table = table.set_column(position, name, outcome.column)
        if outcome.truncated:
            any_truncated = True

    if not any_truncated:
        return ArrowRenderResult(table, errors=[])

    # One warning covers every affected column.
    message = trans(
        "warning.convertedOutOfBoundsToNull",
        "Converted timestamp {timestamp} to null because it is out of bounds.",
        {"timestamp": _out_of_bounds_timestamp(unit)},
    )
    return ArrowRenderResult(table, errors=[RenderError(message)])