Example #1
    def find_modify(self,
                    index_id,
                    operation,
                    columns,
                    modify_operation,
                    modify_columns=[],
                    limit=0,
                    offset=0):
        """Updates/deletes row(s) using opened index.

        Returns number of modified rows or a list of original values in case
        ``modify_operation`` ends with ``?``.

        Raises ``ValueError`` if given data doesn't validate.

        :param integer index_id: id of opened index.
        :param string operation: logical comparison operation to use over ``columns``.
            Currently allowed operations are defined in :const:`~.FIND_OPERATIONS`.
            Only one operation is allowed per call.
        :param iterable columns: list of column values for comparison operation.
            List must be ordered in the same way as columns are defined in
            opened index.
        :param string modify_operation: modification operation (update or delete).
            Currently allowed operations are defined in :const:`~.MODIFY_OPERATIONS`.
        :param iterable modify_columns: list of column values for update operation.
            List must be ordered in the same way as columns are defined in
            opened index. Only usable for the *update* operation.
        :param integer limit: optional limit of results to change. Default is
            one row. If multiple rows are expected to change, ``limit`` must
            be set explicitly; HS won't change all found rows by default.
        :param integer offset: optional offset of rows to search for.
        :rtype: integer or list

        """
        if operation not in self.FIND_OPERATIONS \
                or modify_operation not in self.MODIFY_OPERATIONS:
            raise ValueError('Operation is not supported.')

        if not check_columns(columns):
            raise ValueError('Columns must be a non-empty iterable.')

        if modify_operation in ('U', '+', '-', 'U?', '+?', '-?') \
                and not check_columns(modify_columns):
            raise ValueError(
                'Modify_columns must be a non-empty iterable for update operation'
            )

        query = chain((str(index_id), operation, str(len(columns))),
                      imap(encode, columns),
                      (str(limit), str(offset), modify_operation),
                      imap(encode, modify_columns))

        response = self._call(index_id, query, force_index=True)

        return response
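
All of the HandlerSocket examples on this page rely on a ``check_columns`` helper that is not itself shown, and they target Python 2 (``imap`` comes from ``itertools``; under Python 3 the built-in ``map`` plays the same role). Judging only from the call sites, the validator is truthy for a non-empty iterable; a minimal hypothetical sketch:

def check_columns(columns):
    # Hypothetical sketch inferred from the call sites on this page; the
    # real library's implementation is not shown here and may differ.
    try:
        return len(columns) > 0
    except TypeError:
        return False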
Example #2
    def find_modify(self, index_id, operation, columns, modify_operation,
                    modify_columns=[], limit=0, offset=0):
        """Updates/deletes row(s) using opened index.

        Returns number of modified rows or a list of original values in case
        ``modify_operation`` ends with ``?``.

        Raises ``ValueError`` if given data doesn't validate.

        :param integer index_id: id of opened index.
        :param string operation: logical comparison operation to use over ``columns``.
            Currently allowed operations are defined in :const:`~.FIND_OPERATIONS`.
            Only one operation is allowed per call.
        :param iterable columns: list of column values for comparison operation.
            List must be ordered in the same way as columns are defined in
            opened index.
        :param string modify_operation: modification operation (update or delete).
            Currently allowed operations are defined in :const:`~.MODIFY_OPERATIONS`.
        :param iterable modify_columns: list of column values for update operation.
            List must be ordered in the same way as columns are defined in
            opened index. Only usable for the *update* operation.
        :param integer limit: optional limit of results to change. Default is
            one row. If multiple rows are expected to change, ``limit`` must
            be set explicitly; HS won't change all found rows by default.
        :param integer offset: optional offset of rows to search for.
        :rtype: integer or list

        """
        if operation not in self.FIND_OPERATIONS \
                or modify_operation not in self.MODIFY_OPERATIONS:
            raise ValueError('Operation is not supported.')

        if not check_columns(columns):
            raise ValueError('Columns must be a non-empty iterable.')

        if modify_operation in ('U', '+', '-', 'U?', '+?', '-?') \
                and not check_columns(modify_columns):
            raise ValueError('Modify_columns must be a non-empty iterable for update operation')

        query = chain(
            (str(index_id), operation, str(len(columns))),
            imap(encode, columns),
            (str(limit), str(offset), modify_operation),
            imap(encode, modify_columns)
        )

        response = self._call(index_id, query, force_index=True)

        return response
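
A call against ``find_modify`` might look like the following; the client object, index id, and column values are placeholders, not taken from the library's documentation:

# Placeholder call: assumes index 0 was opened earlier over an (id, name)
# index, and that `client` is a connected instance of this class.
result = client.find_modify(
    index_id=0,
    operation='=',           # must be one of FIND_OPERATIONS
    columns=['42'],          # match values for the indexed columns
    modify_operation='U?',   # update and return the original values
    modify_columns=['42', 'renamed'],
    limit=1,                 # HS modifies a single row unless told otherwise
)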
Example #3
    def find(self, index_id, operation, columns, limit=0, offset=0):
        """Finds row(s) via opened index.

        Raises ``ValueError`` if given data doesn't validate.

        :param integer index_id: id of opened index.
        :param string operation: logical comparison operation to use over ``columns``.
            Currently allowed operations are defined in :const:`~.FIND_OPERATIONS`.
            Only one operation is allowed per call.
        :param iterable columns: list of column values for comparison operation.
            List must be ordered in the same way as columns are defined
            in opened index.
        :param integer limit: optional limit of results to return. Default is
            one row. If multiple results are expected, ``limit`` must be set
            explicitly; HS won't return all found rows by default.
        :param integer offset: optional offset of rows to search for.
        :rtype: list
        """
        if operation not in self.FIND_OPERATIONS:
            raise ValueError('Operation is not supported.')

        if not check_columns(columns):
            raise ValueError('Columns must be a non-empty iterable.')

        query = chain((str(index_id), operation, str(len(columns))),
                      imap(encode, columns), (str(limit), str(offset)))

        response = self._call(index_id, query, force_index=True)

        return response
예제 #4
0
    def find(self, index_id, operation, columns, limit=0, offset=0):
        """Finds row(s) via opened index.

        Raises ``ValueError`` if given data doesn't validate.

        :param integer index_id: id of opened index.
        :param string operation: logical comparison operation to use over ``columns``.
            Currently allowed operations are defined in :const:`~.FIND_OPERATIONS`.
            Only one operation is allowed per call.
        :param iterable columns: list of column values for comparison operation.
            List must be ordered in the same way as columns are defined
            in opened index.
        :param integer limit: optional limit of results to return. Default is
            one row. If multiple results are expected, ``limit`` must be set
            explicitly; HS won't return all found rows by default.
        :param integer offset: optional offset of rows to search for.
        :rtype: list
        """
        if operation not in self.FIND_OPERATIONS:
            raise ValueError('Operation is not supported.')

        if not check_columns(columns):
            raise ValueError('Columns must be a non-empty iterable.')

        query = chain(
            (str(index_id), operation, str(len(columns))),
            imap(encode, columns),
            (str(limit), str(offset))
        )

        response = self._call(index_id, query, force_index=True)

        return response
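
Usage mirrors ``find_modify`` without the modification arguments; the values below are again placeholders:

# Placeholder call: index 0 is assumed to be open over the primary key.
rows = client.find(
    index_id=0,
    operation='>=',   # must be one of FIND_OPERATIONS
    columns=['100'],  # lower bound for the first indexed column
    limit=10,         # without an explicit limit HS returns one row
)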
Example #5
def _data_preprocess(
    clip_start_date,
    clip_end_date,
    disk_smart_df,
    use_model_id,
    use_2017_fault_data,
    is_train,
):
    """
    
    """
    if use_model_id:
        disk_smart_df = disk_smart_df[disk_smart_df.model == use_model_id]
    if clip_start_date is not None:
        disk_smart_df = disk_smart_df[disk_smart_df['dt'] >= clip_start_date]
    if clip_end_date is not None:
        disk_smart_df = disk_smart_df[disk_smart_df['dt'] <= clip_end_date]

    if use_2017_fault_data:
        fault_data_2017_path = os.path.join(conf.DATA_DIR,
                                            '2017_fault_data.h5')
        fault_2017_df = pd.read_hdf(
            fault_data_2017_path,
            columns=SELECTED_CONT_COLS + SELECTED_INDEX_COLS +
            SELECTED_CATE_COLS + SELECTED_LABEL_COLS,
        )
        disk_smart_df = pd.concat([disk_smart_df, fault_2017_df], axis=0)

    # task-specific cleaning rules
    correct_column_type(disk_smart_df)
    index_cols, cate_cols, cont_cols, label_cols = check_columns(
        disk_smart_df.dtypes.to_dict())
    disk_smart_df.drop_duplicates(index_cols, keep='first', inplace=True)
    mask = (disk_smart_df[POWER_ON_HOURS_COL] != 0)
    disk_smart_df = disk_smart_df[mask]
    #     disk_smart_df.dropna(subset=[POWER_ON_HOURS_COL], inplace=True)

    if is_train:
        cols_with_unique_number = remove_cont_cols_with_unique_value(
            disk_smart_df, cont_cols, threshold=DROP_UNIQUE_COL_THRESHOLD)
        disk_smart_df.drop(columns=cols_with_unique_number, inplace=True)
        drop_na_cols = check_nan_value(disk_smart_df,
                                       threshold=DROP_NAN_COL_THRESHOLD)
        disk_smart_df.drop(columns=drop_na_cols, inplace=True)
        disk_smart_df.loc[disk_smart_df[USING_LABEL] != 0,
                          USING_LABEL] = FAULT_LABEL
    return disk_smart_df
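
Note that ``check_columns`` in this example is a different helper from the validator in the HandlerSocket examples: it receives ``df.dtypes.to_dict()`` and splits column names into index, categorical, continuous, and label groups. A rough sketch of such a splitter follows; the grouping rules are assumptions, since the project's own helper is not shown:

def check_columns(dtype_map):
    # Hypothetical sketch. 'serial_number', 'model' and 'dt' appear as
    # index-like columns elsewhere on this page; the label column names
    # below are invented for illustration.
    index_cols, cate_cols, cont_cols, label_cols = [], [], [], []
    for col, dtype in dtype_map.items():
        if col in ('serial_number', 'model', 'dt'):
            index_cols.append(col)
        elif col in ('fault_label', 'tag'):          # assumed label columns
            label_cols.append(col)
        elif str(dtype) == 'category':
            cate_cols.append(col)
        else:
            cont_cols.append(col)
    return index_cols, cate_cols, cont_cols, label_cols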
Example #6
    def insert(self, index_id, columns):
        """Inserts single row using opened index.

        Raises ``ValueError`` if given data doesn't validate.

        :param integer index_id: id of opened index.
        :param list columns: list of column values for insertion. List must be
            ordered in the same way as columns are defined in opened index.
        :rtype: bool
        """
        if not check_columns(columns):
            raise ValueError('Columns must be a non-empty iterable.')

        query = chain((str(index_id), '+', str(len(columns))),
                      imap(encode, columns))

        self._call(index_id, query, force_index=True)

        return True
Example #7
def show_csv_config(request):

    recordtype = 'collectionobjects'

    matrix = deepcopy(RECORDTYPES[recordtype][2][0])
    # convert from dict of tuples to list of tuples
    matrix = [[m] + matrix[m] for m in matrix]
    matrix = [[m[i] for i in (0, 1, 3, 6, 5)] for m in matrix]
    matrix = sorted(matrix, key=lambda x: x[4])
    #labels = 'input_column,cspace_field,context_tag,data_type,check_exists,row_id,authority or vocabulary'.split(',')
    labels = 'input column,cspace field,data type,authority or vocabulary,row id'.split(',')
    columnhandling = check_columns(labels, 'none', recordtype)
    #message = "%s 'actionable' fields configured in config file." % len(matrix)
    message = "'Mappable' fields for %s" % recordtype
    rtypes = [[RECORDTYPES[r][0], r] for r in RECORDTYPES.keys()]

    return labels, matrix, message, rtypes
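
The matrix reshaping above is terse; a toy run of the same idiom, with made-up mapping data standing in for ``RECORDTYPES[recordtype][2][0]``, shows what each step produces:

# Made-up stand-in: input column name -> list of mapping attributes.
matrix = {
    'objectNumber': ['objectNumberField', 'ctx', 'string', 'y', 0, 'none', '-'],
    'title':        ['titleField',        'ctx', 'string', 'n', 1, 'none', '-'],
}
matrix = [[m] + matrix[m] for m in matrix]                      # dict -> rows
matrix = [[row[i] for i in (0, 1, 3, 6, 5)] for row in matrix]  # pick/reorder
matrix = sorted(matrix, key=lambda x: x[4])                     # sort by row id
# -> [['objectNumber', 'objectNumberField', 'string', 'none', 0],
#     ['title', 'titleField', 'string', 'none', 1]]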
Example #8
    def insert(self, index_id, columns):
        """Inserts single row using opened index.

        Raises ``ValueError`` if given data doesn't validate.

        :param integer index_id: id of opened index.
        :param list columns: list of column values for insertion. List must be
            ordered in the same way as columns are defined in opened index.
        :rtype: bool
        """
        if not check_columns(columns):
            raise ValueError('Columns must be a non-empty iterable.')

        query = chain(
            (str(index_id), '+', str(len(columns))),
            imap(encode, columns)
        )

        self._call(index_id, query, force_index=True)

        return True
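
A matching illustrative call; the column values just have to line up with the column order of the opened index:

# Placeholder call: index 0 is assumed to be open over (id, name, created).
ok = client.insert(index_id=0, columns=['1', 'first row', '2018-09-01'])
assert ok is True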
Example #9
def feature_engineering(filename='',
                        fe_save_filename='train_fe.feather',
                        is_train=True,
                        use_2017_fault_data=False,
                        clip_start_date=None,
                        clip_end_date=None,
                        pred_start_date='2018-09-01',
                        pred_end_date='2018-09-30',
                        use_model_id=None,
                        num_processes=10):
    """
    
    :return:
    """
    logger.info('Training-data feature engineering: %s, clip start date: %s, clip end date: %s' %
                (is_train, clip_start_date, clip_end_date))

    # load dataset
    disk_smart_df = _load_data_into_dataframe(filename, is_train)

    # preprocess data
    disk_smart_df = _data_preprocess(clip_start_date, clip_end_date,
                                     disk_smart_df, use_model_id,
                                     use_2017_fault_data, is_train)
    """generate cate feats"""
    fe_df = disk_smart_df.copy(deep=True)
    del disk_smart_df
    gc.collect()

    # map disk model id to a binary categorical feature
    fe_df['model_type'] = fe_df['model'].map({1: 0, 2: 1}).astype('category')
    for col in TRANSFORM_CONT_INTO_CAT_COLS:
        fe_df[col + '_cate'] = 0
        fe_df.loc[fe_df[col] > 0, col + '_cate'] = 1
        fe_df[col + '_cate'] = fe_df[col + '_cate'].astype('category')

    fe_df['power_on_hours_in_day_unit'] = fe_df[POWER_ON_HOURS_COL] // 24
    if is_train:
        fe_df['power_on_hours_in_day_unit_cate'] = pd.cut(
            fe_df['power_on_hours_in_day_unit'],
            bins=BINS_FOR_CUT_POWER_ON_HOURS_FEAT,
            labels=False)
    else:
        fe_df['power_on_hours_in_day_unit_cate'] = pd.cut(
            fe_df['power_on_hours_in_day_unit'],
            POWER_ON_HOURS_CATE_LIST,
            labels=False)
    fe_df['power_on_hours_in_day_unit_cate'] = fe_df[
        'power_on_hours_in_day_unit_cate'].astype('category')
    fe_df.drop(columns=['power_on_hours_in_day_unit'], inplace=True)
    """ generate cont feats"""
    # error weight combination features
    _get_combination_weight(fe_df, ERR_RECORD_COLS, 'err_weight')
    _get_combination_weight(fe_df, SEEK_ERR_COLS, 'seek_err_weight')
    _get_combination_weight(fe_df, DEGRADATION_ERR_COLS,
                            'degradation_err_weight')
    fe_df.drop(columns=TRANSFORM_CONT_INTO_CAT_COLS, inplace=True)

    # sliding window feature
    group_cols = ['model', 'serial_number']
    index_cols, cate_cols, cont_cols, label_cols = check_columns(
        fe_df.dtypes.to_dict())

    if is_train:
        # keep label columns aside for joining back after feature engineering
        label_df = fe_df[index_cols + label_cols]
    fe_df = fe_df[index_cols + cate_cols + cont_cols]
    fe_df = _sliding_window(fe_df, group_cols, cont_cols, cate_cols,
                            num_processes)

    # drop the col with too many nan
    if is_train:
        drop_na_cols = check_nan_value(fe_df, threshold=DROP_NAN_COL_THRESHOLD)
        fe_df.drop(columns=drop_na_cols, inplace=True)

    if is_train:
        # join the label columns back onto the engineered features
        fe_df = _merge_label_df_and_fe_df(label_df, fe_df, index_cols)
        del label_df
        gc.collect()
        fe_df.reset_index(drop=True, inplace=True)
        save_path = os.path.join(conf.DATA_DIR, fe_save_filename)
        fe_df.to_feather(save_path)
        logger.info('Feature engineering file saved to %s' % save_path)
    else:
        # get the prediction duration for predict data
        mask = fe_df.dt >= pred_start_date
        mask &= fe_df.dt <= pred_end_date
        fe_df = fe_df[mask]
        fe_df.reset_index(drop=True, inplace=True)
    return fe_df
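
An illustrative training-mode invocation; the filename and clip dates are placeholders rather than values from the project:

# Placeholder arguments; in training mode the result is also written to
# conf.DATA_DIR as 'train_fe.feather'.
train_fe_df = feature_engineering(
    filename='train_data.csv',
    fe_save_filename='train_fe.feather',
    is_train=True,
    clip_start_date='2017-07-01',
    clip_end_date='2018-08-31',
)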