예제 #1
0
    def columns(self, value):
        """Validate and set self.columns.

        Args:
            value (list): non-empty list of dicts, each of which has the
                keys "name" and "dtype".

        Raises:
            ValueError: if ``value`` is not a list, is empty, contains an
                item that is not a dict with the keys "name" and "dtype",
                or if a ``KeyError`` is raised while building the probe
                Series (presumably for an unknown dtype string -- confirm
                against ``dtype_string_to_dtype_object``).

        """
        schema_error = \
            'Columns must be a list of dicts with keys "name" and "dtype".'

        if not isinstance(value, list):
            raise ValueError(schema_error)
        if not value:
            raise ValueError('At least one item must be in the list.')

        # Check every item, not only the first one: a malformed entry later
        # in the list must be reported as a schema error too, instead of
        # escaping as a TypeError or as a misleading "Unrecognized value".
        for column in value:
            if not isinstance(column, dict) \
               or 'name' not in column or 'dtype' not in column:
                raise ValueError(schema_error)

        # Build throw-away empty Series purely to probe that each column
        # definition can be turned into a Series; the result is discarded.
        try:
            pd.concat([
                pd.Series(name=c['name'],
                          dtype=dtype_string_to_dtype_object(c['dtype']))
                for c in value
            ] + [pd.Series(name='uuid_in_df', dtype=str)],
                      axis=1)
        except KeyError as e:
            # Keep the original cause attached for easier debugging.
            raise ValueError('Unrecognized value: {}'.format(str(e))) from e

        self._columns = value
예제 #2
0
    def _df_from_dicts(self, dicts):
        """Build a DataFrame out of a list of record dicts.

        An empty frame is first laid out from the columns found in
        ``self._config`` (if any) followed by '_uuid' and '_creation_time'
        columns, and indexed by '_uuid'. The records are then concatenated
        onto that frame.

        Args:
            dicts (list): list of dicts

        Returns:
            (pd.DataFrame): a data-frame

        """
        if 'columns' in self._config.keys():
            configured = self._config['columns']
        else:
            configured = []

        # Configured entries named like the two bookkeeping columns are
        # skipped; those columns are always appended at the end instead.
        empty_series = []
        for spec in configured:
            if spec['name'] == '_uuid' or spec['name'] == '_creation_time':
                continue
            empty_series.append(
                pd.Series(name=spec['name'],
                          dtype=dtype_string_to_dtype_object(spec['dtype'])))
        empty_series.append(pd.Series(name='_uuid', dtype=str))
        empty_series.append(pd.Series(name='_creation_time', dtype=float))

        skeleton = pd.concat(empty_series, axis=1).set_index('_uuid')
        records = pd.DataFrame.from_records(dicts)
        return pd.concat([skeleton, records])
예제 #3
0
 def _initialize_df(self):
     """Create the empty DataFrame laid out by self.columns.

     One empty column is made per entry of self.columns, followed by a
     string column named 'uuid_in_df'.

     Returns:
         (pd.DataFrame): the column-wise concatenation of those Series

     """
     empty_columns = []
     for spec in self.columns:
         col_name = spec['name']
         col_dtype = dtype_string_to_dtype_object(spec['dtype'])
         empty_columns.append(pd.Series(name=col_name, dtype=col_dtype))
     empty_columns.append(pd.Series(name='uuid_in_df', dtype=str))
     return pd.concat(empty_columns, axis=1)
예제 #4
0
 def _initialize_df(self):
     """Create the empty DataFrame laid out by self.columns.

     One empty column is made per entry of self.columns, except entries
     named 'uuid_in_df' or 'creation_time_in_df'. Those two columns are
     always appended at the end, and 'uuid_in_df' becomes the index.

     Returns:
         (pd.DataFrame): the frame indexed by 'uuid_in_df'

     """
     empty_columns = []
     for spec in self.columns:
         if spec['name'] == 'uuid_in_df' \
            or spec['name'] == 'creation_time_in_df':
             continue
         col_name = spec['name']
         col_dtype = dtype_string_to_dtype_object(spec['dtype'])
         empty_columns.append(pd.Series(name=col_name, dtype=col_dtype))
     empty_columns.append(pd.Series(name='uuid_in_df', dtype=str))
     empty_columns.append(pd.Series(name='creation_time_in_df', dtype=float))

     frame = pd.concat(empty_columns, axis=1)
     return frame.set_index('uuid_in_df')
예제 #5
0
    def _preprocess_list_of_dicts(self, data_in):
        """Preprocess list of dicts.

        Works on a deep copy of ``data_in``. Each item is passed through
        ``serialize_dict_1d``, stamped with 'uuid_in_df' and
        'creation_time_in_df', given a ``None`` entry for every column of
        ``self.columns`` it lacks, and has its present non-None values cast
        to the column dtype (``np.nan`` when the cast raises ValueError).

        Args:
            data_in (list): list of dicts containing data

        Returns:
            (dict): dict of lists = listed dict

        """
        data = deepcopy(data_in)
        processed = []

        # Serialize (convert list to str)
        self.logger.info('(Preprocess) Serializing...')
        for raw_item in tqdm(data, desc='Serialization', leave=False):
            item = serialize_dict_1d(raw_item)

            # Add df_uuid and creation_time_in_df
            item['uuid_in_df'] = self._get_uuid_from_item(item)
            item['creation_time_in_df'] = datetime.now().timestamp()

            # Add missing columns
            for column in self.columns:
                column_name, column_dtype = column['name'], column['dtype']
                if column_name not in item:
                    item[column_name] = None
                    continue

                dtype_obj = dtype_string_to_dtype_object(column_dtype)
                if dtype_obj is None:
                    continue
                current = item[column_name]
                if current is not None and not isinstance(current, dtype_obj):
                    try:
                        item[column_name] = dtype_obj(current)
                    except ValueError:
                        item[column_name] = np.nan

            # Keep the processed item explicitly. Rebinding the loop variable
            # alone does not update ``data``, so the result would be lost
            # whenever serialize_dict_1d returns a new dict instead of
            # mutating its argument in place.
            processed.append(item)

        # Convert dict to listed dict
        self.logger.info('(Preprocess) Converting...')
        return dicts_to_listed_dict_2d(processed)