def __init__(self, data: Union[List[str], dict], preprocess_function=None, load_mode=False, use_unk=True):
    """Build the index either from raw items or from a previously saved dict.

    :param data: raw items to index, or (with ``load_mode=True``) a saved
        mapping with ``'_metadata'`` and ``'data'`` keys produced by this class.
    :param preprocess_function: optional callable applied to items
        (stored; usage not visible in this block).
    :param load_mode: when True and ``data`` is a dict, restore state from it.
    :param use_unk: whether unknown-token handling is enabled.
    """
    self._temp_unks = dict()
    self._hash = self.new_hash  # default
    self._unk_memory_max = 10000  # prevents memory leak
    self._use_unk = use_unk
    self._preprocess_function = preprocess_function
    self._warning_not_use_unk = False
    if isinstance(data, dict) and load_mode:
        # Restore a previously serialized instance.
        logger.info("Building from file.")
        keys = data.keys()
        assert '_metadata' in keys and 'data' in keys, 'Invalid content.'
        for k, v in data['_metadata'].items():
            if k == '_preprocess_function':
                # load function
                # NOTE(review): pickle.loads on file content executes arbitrary
                # code if the file is untrusted — only load trusted saves.
                if v is not None:
                    v = pickle.loads(string_to_literal(v))
            setattr(self, k, v)
        self._uniques = set(data['data'].values())
        # Saved dict keys are stringified indices; restore them as ints.
        self._index_to_item = {int(k): v for k, v in data['data'].items()}
    else:
        # Fresh build: dedupe items and index them starting after the
        # reserved unknown-token slots (self._n_unks — presumably set as a
        # class attribute or elsewhere; TODO confirm).
        self._uniques = self.get_uniques(data)
        self._index_to_item = dict(enumerate(self._uniques, self._n_unks))
    self._item_to_index = invert_dict(self._index_to_item)
def _normalize_row(self, row, fill_with=None, literal_values=True):
    """Normalize a raw row into a list sized to the table's columns.

    Scalars are wrapped into a one-element list; short rows are padded with
    ``fill_with`` up to ``len(self.cols)`` when columns are defined.

    :param row: a single value or a sequence of cell values.
    :param fill_with: padding value used when the row is shorter than cols.
    :param literal_values: when True, string cells are converted via
        ``string_to_literal``; when False, cells are returned untouched.
    :return: list of normalized cell values.
    """
    if not is_sequence(row):
        row = [row]
    if self._cols:
        # Pad only when the row is shorter than the column set.
        if len(row) < len(self.cols):
            row = fill(value=list(row), max_size=len(self.cols), with_=fill_with)
    if literal_values:
        return [
            string_to_literal(item) if isinstance(item, str) else item
            for item in row
        ]
    # BUG FIX: the flag was previously a comprehension *filter*
    # (``for item in row if literal_values``), so literal_values=False
    # returned an empty list instead of the unconverted row.
    return list(row)
def _convert(self, val): if self._is_byte and not isinstance(val, bytes): return str(val).encode() if not self._is_byte and isinstance(val, str): return string_to_literal(val) return val