def columns(self, value):
    """Set self.columns.

    Args:
        value (list): non-empty list of dicts, each with keys "name" and
            "dtype"

    Raises:
        ValueError: if ``value`` is not a list, is empty, contains an entry
            that is not a dict with keys "name" and "dtype", or if building
            the per-column Series raises a KeyError (presumably an unknown
            dtype string -- confirm against dtype_string_to_dtype_object).

    """
    schema_error = \
        'Columns must be a list of dicts with keys "name" and "dtype".'

    if not isinstance(value, list):
        raise ValueError(schema_error)
    if len(value) < 1:
        raise ValueError('At least one item must be in the list.')

    # Validate every entry, not only the first one, so that a malformed
    # entry further down the list is reported as a schema error instead of
    # surfacing later as a misleading KeyError/TypeError.
    for column in value:
        if not isinstance(column, dict) \
                or "name" not in column \
                or "dtype" not in column:
            raise ValueError(schema_error)

    try:
        # Dry run: build one Series per column to make sure each dtype
        # string can be resolved. The resulting frame is discarded.
        pd.concat(
            [
                pd.Series(name=c['name'],
                          dtype=dtype_string_to_dtype_object(c['dtype']))
                for c in value
            ] + [pd.Series(name='uuid_in_df', dtype=str)],
            axis=1,
        )
    except KeyError as e:
        raise ValueError('Unrecognized value: {}'.format(str(e))) from e

    self._columns = value
def _df_from_dicts(self, dicts): """Create a DataFrame from a list of dicts. Args: dicts (list): list of dicts Returns: (pd.DataFrame): a data-frame """ columns = self._config['columns'] if 'columns' in self._config.keys( ) else [] df = pd.concat( [ pd.Series(name=c['name'], dtype=dtype_string_to_dtype_object(c['dtype'])) for c in columns if c['name'] != '_uuid' and c['name'] != '_creation_time' ] # noqa: E501 + [ pd.Series(name='_uuid', dtype=str), pd.Series(name='_creation_time', dtype=float) ], axis=1) df.set_index('_uuid', inplace=True) df = pd.concat([df, pd.DataFrame.from_records(dicts)]) return df
def _initialize_df(self): """Initialize DF.""" df = pd.concat([ pd.Series(name=c['name'], dtype=dtype_string_to_dtype_object(c['dtype'])) for c in self.columns ] + [pd.Series(name='uuid_in_df', dtype=str)], axis=1) return df
def _initialize_df(self): """Initialize DF.""" df = pd.concat( [pd.Series(name=c['name'], dtype=dtype_string_to_dtype_object(c['dtype'])) for c in self.columns if c['name'] != 'uuid_in_df' and c['name'] != 'creation_time_in_df'] # noqa: E501 + [pd.Series(name='uuid_in_df', dtype=str), pd.Series(name='creation_time_in_df', dtype=float)], axis=1 ) df.set_index('uuid_in_df', inplace=True) return df
def _preprocess_list_of_dicts(self, data_in):
    """Preprocess list of dicts.

    Works on a deep copy of ``data_in``. Each item is serialized, stamped
    with 'uuid_in_df' and 'creation_time_in_df', padded with ``None`` for
    every column of self.columns it lacks, and has present values cast to
    the column dtype where one is resolved.

    Args:
        data_in (list): list of dicts containing data

    Returns:
        (dict): dict of lists = listed dict
            (the return value of dicts_to_listed_dict_2d)

    """
    data = deepcopy(data_in)

    # Serialize (convert list to str)
    self.logger.info('(Preprocess) Serializing...')
    progress = tqdm(data, desc='Serialization', leave=False)
    for index, raw_item in enumerate(progress):
        item = serialize_dict_1d(raw_item)
        # Store the serialized item back: if serialize_dict_1d returns a new
        # dict rather than mutating in place, rebinding the loop variable
        # alone would silently drop everything done below.
        data[index] = item

        # Add df_uuid and creation_time_in_df
        item['uuid_in_df'] = self._get_uuid_from_item(item)
        item['creation_time_in_df'] = datetime.now().timestamp()

        # Add missing columns / cast existing values
        for column in self.columns:
            column_name, column_dtype = column['name'], column['dtype']
            if column_name not in item:
                item[column_name] = None
                continue

            dtype_obj = dtype_string_to_dtype_object(column_dtype)
            if dtype_obj is None:
                # No dtype resolved for this column: leave the value as is
                continue

            current = item[column_name]
            if current is None or isinstance(current, dtype_obj):
                continue
            try:
                item[column_name] = dtype_obj(current)
            except (TypeError, ValueError):
                # Value cannot be cast to the column dtype: mark as missing
                item[column_name] = np.nan

    # Convert dict to listed dict
    self.logger.info('(Preprocess) Converting...')
    data = dicts_to_listed_dict_2d(data)

    return data