Пример #1
0
 def __init__(self,
              data: Union[List[str], dict],
              preprocess_function=None,
              load_mode=False,
              use_unk=True):
     """Set up the index <-> item mappings, from raw data or a saved dict.

     Args:
         data: Either the raw items to index, or (with ``load_mode``) a
             dict holding the keys ``'_metadata'`` and ``'data'``.
         preprocess_function: Stored as ``_preprocess_function``; may be
             overridden by the loaded metadata in load mode.
         load_mode: When truthy and ``data`` is a dict, restore the state
             from ``data`` instead of building it from scratch.
         use_unk: Stored as ``_use_unk``.

     Raises:
         AssertionError: In load mode, if ``data`` lacks ``'_metadata'``
             or ``'data'``.
     """
     # Defaults first: in load mode the metadata loop below may overwrite
     # any of these attributes.
     self._temp_unks = {}
     self._hash = self.new_hash  # default
     self._unk_memory_max = 10000  # prevents memory leak
     self._use_unk = use_unk
     self._preprocess_function = preprocess_function
     self._warning_not_use_unk = False

     restoring = isinstance(data, dict) and load_mode
     if not restoring:
         # Fresh build: indexes start at self._n_unks, which is not set in
         # this method (presumably a class attribute/property -- confirm).
         self._uniques = self.get_uniques(data)
         self._index_to_item = dict(enumerate(self._uniques, self._n_unks))
     else:
         logger.info("Building from file.")
         assert '_metadata' in data and 'data' in data, 'Invalid content.'
         for attr_name, attr_value in data['_metadata'].items():
             if attr_name == '_preprocess_function' and attr_value is not None:
                 # The function is stored in serialized form.
                 # NOTE(review): pickle.loads can execute arbitrary code;
                 # only load content from trusted sources.
                 attr_value = pickle.loads(string_to_literal(attr_value))
             setattr(self, attr_name, attr_value)
         stored_items = data['data']
         self._uniques = set(stored_items.values())
         # Keys are converted back to int indexes.
         self._index_to_item = {
             int(index): item for index, item in stored_items.items()
         }
     self._item_to_index = invert_dict(self._index_to_item)
Пример #2
0
 def _normalize_row(self, row, fill_with=None, literal_values=True):
     """Return ``row`` as a list, padded and optionally literal-converted.

     Args:
         row: A single value or a sequence of values. A value for which
             ``is_sequence`` is false is wrapped in a one-element list.
         fill_with: Value handed to ``fill`` as ``with_`` when the row is
             shorter than the number of columns.
         literal_values: When true, every ``str`` item is passed through
             ``string_to_literal``; other items are kept as they are.
             When false, the items are returned unconverted.

     Returns:
         list: The normalized row.
     """
     if not is_sequence(row):
         row = [row]
     # Pad only when columns are defined and the row is too short
     # (presumably `fill` extends the list up to max_size -- confirm).
     if self._cols and len(row) < len(self.cols):
         row = fill(value=list(row),
                    max_size=len(self.cols),
                    with_=fill_with)
     if not literal_values:
         # The original used `literal_values` as the comprehension filter,
         # which dropped every item and returned [] when it was False.
         return list(row)
     return [
         string_to_literal(item) if isinstance(item, str) else item
         for item in row
     ]
Пример #3
0
 def _convert(self, val):
     if self._is_byte and not isinstance(val, bytes):
         return str(val).encode()
     if not self._is_byte and isinstance(val, str):
         return string_to_literal(val)
     return val