示例#1
0
    def _manage_date_time(self):
        # TODO: check what happens with this columns in the cd_update and self.env.cols
        if 'DATE' not in self.df:
            lg.info('-- CREATE DATE COLUMN')
            if 'YEAR' in self.df and 'MONTH' in self.df and 'DAY' in self.df:
                try:
                    self.df = self.df.assign(
                        DATE=pd.to_datetime(self.df[['YEAR', 'MONTH', 'DAY']]).dt.strftime('%Y%m%d')
                    )
                except Exception as e:
                    raise ValidationError(
                        'DATE column, which is a required field, does not exist. Also, it could not be created'
                        ' from YEAR, MONTH and DAY columns possibly because some of the rows do not have any value.',
                        rollback=self.rollback
                    )
                self.add_moves_element(
                    'required_column_added',
                    'DATE column was automatically generated from the columns YEAR, MONTH and DAY'
                )
            else:
                raise ValidationError(
                    'DATE column, which is a required field, does not exist. Also, it could not be built'
                    ' with other columns (usually year, month and day).',
                    rollback=self.rollback
                )

        if 'TIME' in self.df:  # fill with zeros on the left: 132 >> 0132
            self.df['TIME'] = self.df[self.df['TIME'].notnull()]['TIME'].astype(float).apply(lambda x: f'{x:04.0f}')
示例#2
0
    def _validate_original_data(self):
        ''' Checks if all the rows have the same number of elements '''
        lg.info('-- CHECK DATA FORMAT (CSV)')
        with open(self.filepath_or_buffer, newline='', errors='surrogateescape') as csvfile:
            spamreader = csv.reader(csvfile, delimiter=',', quotechar='"')
            first_len = -1
            row_number = 1
            for row in spamreader:
                if row_number == 1 and '' in row:
                    csvfile.close()
                    raise ValidationError(
                        'Some header column name is missing: ROW = {} | COL = {}'.format(
                            row_number, row.index('') + 1
                        ),
                        rollback=self.rollback
                    )
                    break                               # interrupt for loop

                # NOTE: this code is used to check if all the rows have the same number
                #       of elements, if the cells are empty (",,") the value is fill with NaN

                if first_len == -1:
                    first_len = len(row)
                else:
                    if first_len != len(row):
                        csvfile.close()
                        raise ValidationError(
                            'There is an invalid number of fields ({}) in the row: {}.'
                            ' The number of header columns fields is: {}'.format(
                                len(row), row_number, first_len
                            ),
                            rollback=self.rollback
                        )
                        break                               # interrupt for loop
                row_number += 1
示例#3
0
 def _init_cruise_data(self, update=False):
     ''' Checks data type and instantiates the appropriate cruise data object
             `whp` and `raw_csv` (csv) >> process file from scratch and validate data
             `aqc` >> open directly
             @update - boolean, whether the instantiated object is to make comparisons or not
     '''
     lg.info('-- INIT CRUISE DATA OBJECT')
     if update:
         rollback = 'cd_update'
         working_dir = UPD
         cd_aux = True
     else:
         rollback = 'cd'
         working_dir = TMP
         cd_aux = False
     original_path = path.join(working_dir, 'original.csv')
     if path.isfile(original_path):
         if self._is_plain_text(original_path):
             cd = None
             is_whp_format = self._is_whp_format(original_path)
             if path.isfile(path.join(
                     working_dir, 'data.csv')):  # aqc or pending session
                 original_type = 'whp' if is_whp_format else 'csv'
                 cd = CruiseDataAQC(original_type=original_type,
                                    working_dir=working_dir,
                                    cd_aux=cd_aux)
             else:
                 if is_whp_format:
                     # generates data.csv from original.csv
                     cd = CruiseDataWHP(working_dir=working_dir,
                                        cd_aux=cd_aux)
                 else:
                     # the data.csv should be a copy of original.csv, at the beggining at least
                     cd = CruiseDataCSV(working_dir=working_dir,
                                        cd_aux=cd_aux)
         else:
             raise ValidationError(
                 'The file to open should be a CSV file.'
                 ' That is a plain text file with comma separate values.',
                 rollback=rollback)
         if not update:
             self.env.cruise_data = cd
         else:
             self.env.cd_aux = cd
     else:
         raise ValidationError('The file could not be open',
                               rollback=rollback)
示例#4
0
 def _validate_required_columns(self):
     lg.info('-- VALIDATE REQUIRED COLUMNS')
     required_columns = self.env.f_handler.get_custom_cols_by_attr('required')
     if (not set(self.get_cols_by_attrs('all')).issuperset(required_columns)):
         missing_columns = ', '.join(list(set(required_columns) - set(self.get_cols_by_attrs('all'))))
         raise ValidationError(
             'Missing required columns in the file: [{}]'.format(missing_columns),
             rollback=self.rollback
         )
 def proj_settings_cps(self):
     # TODO: this is executed many time when the app load a file, avoid multiple file reading to improve efficiency
     try:
         proj_settings = json.load(open(PROJ_SETTINGS))
         return proj_settings['computed_params'] if 'computed_params' in proj_settings else {}
     except Exception:
         raise ValidationError(
             'Project JSON settings file could be opened to process the calculated parameters',
             rollback='cd'  # TODO: only if we are loading the files in the initialization
         )
 def _validate_original_data(
         self):  # TODO: this should be in each cruise data class
     ''' Checks if all the rows have the same number of elements '''
     lg.info('-- CHECK DATA FORMAT (WHP)')
     with open(self.filepath_or_buffer,
               newline='',
               errors='surrogateescape') as csvfile:
         spamreader = csv.reader(
             csvfile, delimiter=',',
             quotechar='"')  # TODO: ignore comments with #
         first_len = -1
         row_number = 1
         header = True
         trim_excel_artifacts = re.compile(r'^[\s\"]*')
         for row in spamreader:
             row[0] = trim_excel_artifacts.sub('', row[0])
             if not (row[0].startswith('BOTTLE')
                     or row[0].startswith('END_DATA')
                     or row[0].startswith('#')):
                 if header is True and '' in row:
                     csvfile.close()
                     raise ValidationError(
                         'Some header column name is missing: FILE ROW = {} | COL = {}'
                         .format(row_number,
                                 row.index('') + 1),
                         rollback='cd')
                     break  # interrupt for loop
                 if header is True:
                     header = False
                 if first_len == -1:
                     first_len = len(row)
                 else:
                     if first_len != len(
                             row
                     ):  # TODO: empty fields in the csv should be filled by NaN values
                         csvfile.close()
                         raise ValidationError(
                             'There is an invalid number of fields ({}) in the row: {}.'
                             ' The number of header columns fields is: {}'.
                             format(len(row), row_number, first_len),
                             rollback=self.rollback)
                         break  # interrupt for loop
             row_number += 1
示例#7
0
 def _validate_required_columns(self):
     lg.info('-- VALIDATE REQUIRED COLUMNS')
     if (not set(
             self.get_cols_by_type('all')).issuperset(REQUIRED_COLUMNS)):
         missing_columns = ', '.join(
             list(
                 set(REQUIRED_COLUMNS) - set(self.get_cols_by_type('all'))))
         raise ValidationError(
             'Missing required columns in the file: [{}]'.format(
                 missing_columns),
             rollback=self.rollback)
示例#8
0
    def _create_date_column(self):
        # TODO: check what happens with this columns in the cd_update and self.env.cols

        cols = self.df.columns.tolist()
        if 'DATE' not in cols:
            lg.info('-- CREATE DATE COLUMN')
            if 'YEAR' in cols and 'MONTH' in cols and 'DAY' in cols:
                try:
                    self.df = self.df.assign(DATE=pd.to_datetime(self.df[
                        ['YEAR', 'MONTH', 'DAY']]).dt.strftime('%Y%m%d'))
                except Exception as e:
                    raise ValidationError(
                        'DATE column, which is a required field, does not exist. Also, it could not be created'
                        ' from YEAR, MONTH and DAY columns possibly because some of the rows do not have any value.',
                        rollback=self.rollback)
                self.add_moves_element(
                    'required_column_added',
                    'DATE column was automatically generated from the columns YEAR, MONTH and DAY'
                )
            elif 'DATE_YY' in cols and 'DATE_MM' in cols and 'DATE_DD' in cols:
                mapper = dict(
                    zip(('DATE_YY', 'DATE_MM', 'DATE_DD'),
                        ('YEAR', 'MONTH', 'DAY')))
                try:
                    self.df = self.df.assign(DATE=pd.to_datetime(self.df[
                        ['DATE_YY', 'DATE_MM', 'DATE_DD']].rename(
                            columns=mapper)).dt.strftime('%Y%m%d'))
                except Exception as e:
                    raise ValidationError(
                        'DATE column, which is a required field, does not exist. Also, it could not be created'
                        ' from DATE_YY, DATE_MM and DATE_DD columns possibly because some of the rows do not have any value.',
                        rollback=self.rollback)
                self.add_moves_element(
                    'required_column_added',
                    'DATE column was automatically generated from the columns DATE_YY, DATE_MM and DATE_DD'
                )
            else:
                raise ValidationError(
                    'DATE column, which is a required field, does not exist. Also, it could not be built'
                    ' with other columns (usually year, month and day).',
                    rollback=self.rollback)
示例#9
0
    def _validate_flag_values(self):
        ''' Assign 9 to the rows where the param has an NaN
            Also checks if there is any NaN or incorrect value in the flag columns
        '''
        lg.info('-- VALIDATE FLAG VALUES')
        for param in self.df:
            flag = param + FLAG_END
            if flag in self.df:
                upds = self.df[self.df[param].isnull() & (self.df[flag] != 9)].index.tolist()
                if len(upds) > 0:
                    empty_rows = {}
                    empty_rows[flag] = 9
                    self.df[self.df[param].isnull() & (self.df[flag] != 9)] = self.df[self.df[param].isnull() & (self.df[flag] != 9)].assign(**empty_rows)
                    self.add_moves_element(
                        'flag_column_updated',
                        f'The flag column {flag} had some NaN values in the related parameter column. '
                        f'It was set to the empty default value 9 in {len(upds)} rows.'
                    )

                # NOTE: if the flag value is NaN or is not between [0-9] > throw error or reset to 9?
                if self.df[flag].isnull().any():
                    raise ValidationError(
                        'The flag column {} has a/some null value/s in the row/s '
                        '(row position taking into account just the data cells): {}'.format(
                            flag, str(self.df[self.df[flag].isnull()].index.tolist())[1:-1]
                        ),
                        rollback=self.rollback
                    )
                if self.df[(self.df[flag] > 9) | (self.df[flag] < 0)].index.any():
                    raise ValidationError(
                        'The flag column {} must have values between 0-9 in the row '
                        '(row position taking into account just the data cells): {}'.format(
                            flag, str(self.df[(self.df[flag] > 9) | (self.df[flag] < 0)].index.tolist())[1:-1]
                        ),
                        rollback=self.rollback
                    )
 def _sanitize_original_csv(self):
     try:
         lg.info('-- TRYING TO SANITIZE UGLY EXCEL ARTIFACTS IN WHP FILES')
         with open(self.filepath_or_buffer, 'r',
                   errors='surrogateescape') as f:
             trim_excel_artifacts = re.compile(r'[\n\r][\s\"]*')
             buf = trim_excel_artifacts.sub('\n', f.read())
             buf = re.sub(r'[\n\r]END_DATA[\s\S]*', '\nEND_DATA', buf)
         with open(self.filepath_or_buffer, 'w',
                   errors='surrogateescape') as f:
             f.write(buf)
     except Exception as e:
         raise ValidationError(
             'Error trying to sanitize ugly excel artifacts in WHP files: {}'
             .format(e),
             rollback=self.rollback)
示例#11
0
    def get_cols_by_attrs(self, column_attrs=[], discard_nan=False):
        ''' Possible attrs:
                * computed      - calculated parameters
                * param         - parameters
                * non_qc        - params without qc column
                * flag          - existing flags for the params that were loaded from the beginning
                * required      - required columns
                * created       - columns created by the app
                * empty         - empty columns

            @discard_nan - discards columns with all the values = NaN

            NOTE: the result is an union of sets. For instance, if you need to get
                  the created flag columns you shoud make the difference between the
                  'flag' columns minus the 'created' columns. I could add another argument
                  to select the operation
        '''
        if isinstance(column_attrs, str):
            column_attrs = [column_attrs]
        if len(column_attrs) == 1 and 'all' in column_attrs:
            column_attrs = [
                'computed', 'param', 'non_qc',
                'flag', 'required', 'created'
            ]
        res = []
        for t in column_attrs:
            for c in self.cols:
                if t in self.cols[c]['attrs']:
                    if c not in res:
                        res.append(c)
        res = list(set(res))  # one column may have multiple attrs
        df_cols = list(self.df.columns)
        col_positions = dict(
            [(df_cols[df_cols.index(x)], df_cols.index(x)) for x in df_cols]  # {'COL1': 0, 'COL2': 1, ...}
        )
        try:
            prepaired_list = [(col_positions[x], x) for x in res]
        except Exception:
            raise ValidationError(
                'Some columns in the settings.json file or '
                'self.cols object is not in the DataFrame'
            )
        sorted_list = sorted(prepaired_list, key=lambda elem: elem[0])  # reordering
        final_list = [x[1] for x in sorted_list]
        if discard_nan:
            final_list = self._discard_nan_columns(final_list)
        return final_list
示例#12
0
    def get_cols_by_type(self, column_types=[], discard_nan=False):
        ''' Possible types:
                * computed      - calculated parameters
                * param         - parameters
                * non_qc_param  - params without qc column
                * param_flag    - existing flags for the params that were loaded from the beginning
                * qc_param_flag - flags that were created by the application with value 2
                * required      - required columns

            @discard_nan - discards columns with all the values = NaN

            NOTE: a flag param could have the types 'param_flag' and 'qc_param_flag' at the same time
        '''
        if isinstance(column_types, str):
            column_types = [column_types]
        if len(column_types) == 1 and 'all' in column_types:
            column_types = [
                'computed', 'param', 'non_qc_param', 'param_flag',
                'qc_param_flag', 'required'
            ]
        res = []
        for t in column_types:
            for c in self.cols:
                if t in self.cols[c]['types']:
                    if c not in res:
                        res.append(c)
        res = list(set(res))  # one column may have multiple types
        df_cols = list(self.df.columns)
        col_positions = dict([(df_cols[df_cols.index(x)], df_cols.index(x))
                              for x in df_cols]  # {'COL1': 0, 'COL2': 1, ...}
                             )
        try:
            prepaired_list = [(col_positions[x], x) for x in res]
        except Exception:
            raise ValidationError('Some column in the columns.json file or '
                                  'self.cols object is not in the DataFrame')
        sorted_list = sorted(prepaired_list,
                             key=lambda elem: elem[0])  # reordering
        final_list = [x[1] for x in sorted_list]
        if discard_nan:
            final_list = self._discard_nan_columns(final_list)
        return final_list
示例#13
0
 def _create_date_column(self):
     if 'DATE' not in self.df.columns.tolist():
         lg.info(
             '>> Generating the DATE column from YEAR MONTH and DAY columns'
         )
         try:
             self.df = self.df.assign(DATE=pd.to_datetime(self.df[
                 ['YEAR', 'MONTH', 'DAY']]).dt.strftime('%Y%m%d'))
             self.add_moves_element(
                 'required_column_added',
                 'DATE column was automatically generated from the columns YEAR, MONTH and DAY'
             )
         except Exception as e:
             missing_columns = ', '.join(
                 list(
                     set(REQUIRED_COLUMNS) -
                     set(self.get_cols_by_type('all'))))
             raise ValidationError(
                 'DATE column did not exist and it could not be created. And that is a required column'
                 ' from YEAR, MONTH and DAY columns: [{}]'.format(
                     missing_columns),
                 rollback=self.rollback)
示例#14
0
 def get_cruise_data_columns(self):
     lg.info('-- GET CRUISE DATA COLUMNS')
     if self.env.cruise_data is None:
         self._init_cruise_data()
     params = self.env.cruise_data.get_cols_by_type('param',
                                                    discard_nan=True)
     if len(params) == 0:
         raise ValidationError(
             'There should be at least one parameter with data, '
             'in addition to the required columns and the parameters that'
             ' should not have a QC column associated.',
             rollback='cd')
     d = {
         'cps':
         self.env.cruise_data.get_cols_by_type('computed'),
         'cols':
         self.env.cruise_data.get_cols_by_type(
             ['param', 'param_flag', 'qc_param_flag', 'computed'],
             discard_nan=True),
         'params':
         params
     }
     return d