Example #1
def verify_file_hdf(logger, file, data_tags, limits, quality_checks,
                    report_type, report_dir, consumers):
    """
    This method handles verification of data in an hdf type file.

    This method creates and starts a new handler process. The handler is initialized with the data
    queue, the limits, a result queue, the quality checks, and the consumers. The data type of each
    enqueued slice can be 'data_dark', 'data_white' or 'data'. After starting the process the function
    enqueues the data into the queue slice by slice, until all data is queued. The last enqueued
    element is the end-of-data marker.

    Parameters
    ----------
    logger: Logger
        Logger instance.

    file : str
        a filename including path that will be verified

    data_tags : dict
        a dictionary of data_type/hdf tag pairs

    limits : dict
        a dictionary of limits values

    quality_checks : dict
        a dictionary specifying the quality checks structure that will be applied to verify the data file

    report_type : int
        report type, currently supporting 'none', 'errors', and 'full'

    report_dir : str
        a directory where report files will be located

    consumers : dict
        a dictionary containing consumer processes to run, and their parameters

    Returns
    -------
    bad_indexes : dict
        a dictionary of bad indexes per data type

    """
    def process_data(data_type):
        data_tag = data_tags[data_type]
        dt = fp[data_tag]
        for i in range(0, dt.shape[0]):
            data = Data(const.DATA_STATUS_DATA, dt[i], data_type)
            dataq.put(data)
            # add a delay to slow down the upstream flow, so the downstream
            # flow (results) is handled in sync
            time.sleep(.1)

    fp, tags = utils.get_data_hdf(file)
    dataq = Queue()
    aggregateq = Queue()

    p = Process(target=handler.handle_data,
                args=(dataq, limits, aggregateq, quality_checks, consumers))
    p.start()

    # assume a fixed order of data types; this will determine indexes on the data
    if 'data_dark' in data_tags:
        process_data('data_dark')
    if 'data_white' in data_tags:
        process_data('data_white')
    if 'data' in data_tags:
        process_data('data')

    dataq.put(Data(const.DATA_STATUS_END))

    # determine the report file location; no report is written when
    # report_type is REPORT_NONE
    report_file = None
    if report_type != const.REPORT_NONE:
        if report_dir is None:
            report_file = file.rsplit(".", 1)[0] + '.report'
        else:
            file = file.rsplit(".", 1)[0]
            file_path = file.rsplit("/")
            report_file = report_dir + "/" + file_path[-1] + '.report'

    # receive the results
    bad_indexes = {}
    aggregate = aggregateq.get()

    if report_file is not None:
        report.report_results(logger, aggregate, None, report_file,
                              report_type)
    report.add_bad_indexes(aggregate, bad_indexes)

    logger.info('data verifier evaluated ' + file + ' file')
    return bad_indexes
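
The queue/handler pattern described in the docstring above can be shown in isolation. The sketch below is a simplified, hypothetical stand-in for the project's handler.handle_data: a worker process drains a data queue slice by slice until it sees an end-of-data sentinel, then publishes an aggregate result on a second queue, which is how verify_file_hdf communicates with its handler.

# Minimal sketch of the producer/handler pattern used above; the names consume
# and the placeholder quality check are illustrative only.
from multiprocessing import Process, Queue


def consume(dataq, resultq):
    # drain the queue slice by slice until the end-of-data marker arrives
    bad = []
    while True:
        item = dataq.get()
        if item is None:            # None plays the role of DATA_STATUS_END
            break
        index, value = item
        if value < 0:               # placeholder quality check
            bad.append(index)
    resultq.put(bad)                # publish the aggregate back to the parent


if __name__ == '__main__':
    dataq, resultq = Queue(), Queue()
    p = Process(target=consume, args=(dataq, resultq))
    p.start()
    for i, value in enumerate([3.0, -1.0, 2.5]):   # stands in for hdf slices
        dataq.put((i, value))
    dataq.put(None)                 # end-of-data marker
    print(resultq.get())            # -> [1]
    p.join()
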
Example #2
def verify(conf, folder, data_type, num_files, report_by_files=True):
    """
    This function discovers new files and evaluates data in the files.

    This is the main function called when the verifier application starts.
    It reads the configuration for the directory to monitor, for the pattern
    that represents a file extension to look for, and for the number of
    files that are expected for the experiment. The number of files
    configuration parameter is added for experiments that generate
    multiple files. In some cases the experiment data is collected
    into a single file, which is organized with data sets.

    The function calls the directory function that sets up the monitoring
    and returns a notifier. After the monitoring is initialized, it starts
    a loop that reads the global "*files*" queue and then the global
    "*results*" queue. If there is a new file, the file is removed
    from the queue, and the data in the file is validated by a sequence
    of validation methods. If there is a new result, the result is
    removed from the queue, the corresponding process is terminated, and
    the result is presented (currently printed on the console; later it
    will be pushed into an EPICS process variable).

    The loop is interrupted when all expected processes have produced results.
    The number of expected processes is determined by the number of files and
    the number of validation functions.

    Parameters
    ----------
    conf : str
        configuration file name, including path

    folder : str
        monitored directory

    data_type : str
        defines which data type is being evaluated

    num_files : int
        number of files that will be processed

    report_by_files : boolean
        this variable directs how to present the bad indexes in a report. If True, the indexes
        are related to the files, and a filename is included in the report. Otherwise, the
        report contains a list of bad indexes.

    Returns
    -------
    bad_indexes : dict
        a dictionary or list containing bad indexes

    """
    logger, limits, quality_checks, extensions, report_type, consumers = init(
        conf)
    if not os.path.isdir(folder):
        logger.error('parameter error: directory ' + folder +
                     ' does not exist')
        sys.exit(-1)

    notifier = directory(folder, extensions)

    interrupted = False
    file_list = []
    offset_list = []
    dataq = Queue()
    aggregateq = Queue()
    p = Process(target=datahandler.handle_data,
                args=(dataq, limits, aggregateq, quality_checks, consumers))
    p.start()

    file_index = 0
    slice_index = 0
    while not interrupted:
        # The notifier will put a new file into a newFiles queue if one was
        # detected
        notifier.process_events()
        if notifier.check_events():
            notifier.read_events()

        # checking the newFiles queue for new entries and starting verification
        # processes for each new file
        while not files.empty():
            file = files.get()
            if file.find('INTERRUPT') >= 0:
                # the calling function may use an 'INTERRUPT' command to stop
                # the monitoring and processing
                dataq.put(Data(const.DATA_STATUS_END))
                notifier.stop()
                interrupted = True
                break
            else:
                if file_index == 0:
                    report_file = file.rsplit(".", )[0] + '.report'
                fp, tags = utils.get_data_hdf(file)
                data_tag = tags['/exchange/' + data_type]
                data = np.asarray(fp[data_tag])
                slice_index += data.shape[0]
                file_list.append(file)
                offset_list.append(slice_index)
                for i in range(0, data.shape[0]):
                    dataq.put(Data(const.DATA_STATUS_DATA, data[i], data_type))
                file_index += 1
                if file_index == num_files:
                    dataq.put(Data(const.DATA_STATUS_END))
                    notifier.stop()
                    interrupted = True
                    break

    aggregate = aggregateq.get()

    #report.report_results(logger, aggregate, data_type, None, report_file, report_type)

    bad_indexes = {}
    if report_by_files:
        report.add_bad_indexes_per_file(aggregate, bad_indexes, file_list,
                                        offset_list)
    else:
        report.add_bad_indexes(aggregate, bad_indexes)
    try:
        with open(report_file, 'w') as report_fp:
            report.report_bad_indexes(bad_indexes, report_fp)
    except Exception:
        logger.warning('Cannot open report file')

    return bad_indexes
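
verify records one cumulative slice count per file in offset_list; those offsets are what let report.add_bad_indexes_per_file attribute a global bad slice index from the aggregate to a specific file. A possible sketch of that mapping follows, with the hypothetical helper locate_bad_index standing in for the report module's logic.

# Sketch only: maps a global slice index back to (file, local index) using the
# cumulative offsets gathered in the monitoring loop above.
from bisect import bisect_right


def locate_bad_index(global_index, file_list, offset_list):
    # offset_list[i] is the cumulative slice count after file_list[i]
    pos = bisect_right(offset_list, global_index)
    start = offset_list[pos - 1] if pos > 0 else 0
    return file_list[pos], global_index - start


files_seen = ['scan_001.h5', 'scan_002.h5']       # hypothetical file names
offsets = [5, 8]                                  # 5 slices, then 3 more
print(locate_bad_index(6, files_seen, offsets))   # -> ('scan_002.h5', 1)
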
Example #3
def verify_file_hdf(logger, file, data_tags, limits, quality_checks, report_type, report_dir):
    """
    This method handles verification of an hdf type file.

    This method creates and starts a new handler process. The handler is initialized with a data
    queue, the data type, and a result queue. The data type can be 'data_dark', 'data_white' or
    'data'. After starting the process the function enqueues the data into the queue slice by slice,
    until all data is queued. The last enqueued element is the end-of-data marker.

    Parameters
    ----------
    logger: Logger
        Logger instance.

    file : str
        a filename including path that will be verified

    data_tags : dict
        a dictionary of data_type/hdf tag pairs

    limits : dict
        a dictionary of limits values

    quality_checks : dict
        a dictionary specifying the quality checks structure that will be applied to verify the data file

    report_type : int
        report type, currently supporting 'none', 'errors', and 'full'

    report_dir : str
        a directory where report files will be located

    Returns
    -------
    bad_indexes : dict
        a dictionary of bad indexes per data type

    """
    fp, tags = utils.get_data_hdf(file)

    queues = {}
    bad_indexes = {}

    for data_type in data_tags.keys():
        data_tag = data_tags[data_type]
        if data_tag in tags:
            queue = Queue()
            queues[data_type] = queue
            process_data(data_type, queue, fp, data_tag, limits, quality_checks)

    # determine the report file location; no report is written when
    # report_type is REPORT_NONE
    report_file = None
    if report_type != const.REPORT_NONE:
        if report_dir is None:
            report_file = file.rsplit(".", 1)[0] + '.report'
        else:
            file = file.rsplit(".", 1)[0]
            file_path = file.rsplit("/")
            report_file = report_dir + "/" + file_path[-1] + '.report'

    # receive the results
    for data_type in queues.keys():
        queue = queues[data_type]
        aggregate = queue.get()
        report.add_bad_indexes(aggregate, data_type, bad_indexes)
        if report_file is not None:
            report.report_results(logger, aggregate, data_type, file, report_file, report_type)

    logger.info('data verifier evaluated ' + file + ' file')

    return bad_indexes
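
The report path rule in this example (same base name as the data file, placed either next to it or inside report_dir) can also be written with os.path helpers. The sketch below is an equivalent formulation for illustration, not code from the project.

# Equivalent sketch of the report path construction using os.path helpers.
import os


def report_path(data_file, report_dir=None):
    base = os.path.splitext(data_file)[0]        # strip the extension
    if report_dir is None:
        return base + '.report'                  # report next to the data file
    return os.path.join(report_dir, os.path.basename(base) + '.report')


print(report_path('/data/scan_001.h5'))                  # /data/scan_001.report
print(report_path('/data/scan_001.h5', '/tmp/reports'))  # /tmp/reports/scan_001.report
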
Example #4
def verify(conf, folder, data_type, num_files, report_by_files=True):
    """
    This function discovers new files and evaluates data in the files.

    This is the main function called when the verifier application starts.
    It reads the configuration for the directory to monitor, for the pattern
    that represents a file extension to look for, and for the number of
    files that are expected for the experiment. The number of files
    configuration parameter is added for experiments that generate
    multiple files. In some cases the experiment data is collected
    into a single file, which is organized with data sets.

    The function calls the directory function that sets up the monitoring
    and returns a notifier. After the monitoring is initialized, it starts
    a loop that reads the global "*files*" queue and then the global
    "*results*" queue. If there is a new file, the file is removed
    from the queue, and the data in the file is validated by a sequence
    of validation methods. If there is a new result, the result is
    removed from the queue, the corresponding process is terminated, and
    the result is presented (currently printed on the console; later it
    will be pushed into an EPICS process variable).

    The loop is interrupted when all expected processes have produced results.
    The number of expected processes is determined by the number of files and
    the number of validation functions.

    Parameters
    ----------
    conf : str
        configuration file name, including path

    folder : str
        monitored directory

    data_type : str
        defines which data type is being evaluated

    num_files : int
        number of files that will be processed

    report_by_files : boolean
        this variable directs how to present the bad indexes in a report. If True, the indexes
        are related to the files, and a filename is included in the report. Otherwise, the
        report contains a list of bad indexes.

    Returns
    -------
    bad_indexes : dict
        a dictionary or list containing bad indexes

    """
    logger, limits, quality_checks, extensions, report_type, consumers = init(conf)
    if not os.path.isdir(folder):
        logger.error(
            'parameter error: directory ' +
            folder + ' does not exist')
        sys.exit(-1)

    notifier = directory(folder, extensions)

    interrupted = False
    file_list = []
    offset_list = []
    dataq = Queue()
    aggregateq = Queue()
    p = Process(target=datahandler.handle_data, args=(dataq, limits, aggregateq, quality_checks, consumers))
    p.start()

    file_index = 0
    slice_index = 0
    while not interrupted:
        # The notifier will put a new file into a newFiles queue if one was
        # detected
        notifier.process_events()
        if notifier.check_events():
            notifier.read_events()

        # checking the newFiles queue for new entries and starting verification
        # processes for each new file
        while not files.empty():
            file = files.get()
            if file.find('INTERRUPT') >= 0:
                # the calling function may use an 'INTERRUPT' command to stop
                # the monitoring and processing
                dataq.put(Data(const.DATA_STATUS_END))
                notifier.stop()
                interrupted = True
                break
            else:
                if file_index == 0:
                    report_file = file.rsplit(".",)[0] + '.report'
                fp, tags = utils.get_data_hdf(file)
                data_tag = tags['/exchange/'+data_type]
                data = np.asarray(fp[data_tag])
                slice_index += data.shape[0]
                file_list.append(file)
                offset_list.append(slice_index)
                for i in range(0, data.shape[0]):
                    dataq.put(Data(const.DATA_STATUS_DATA, data[i], data_type))
                file_index += 1
                if file_index == num_files:
                    dataq.put(Data(const.DATA_STATUS_END))
                    notifier.stop()
                    interrupted = True
                    break

    aggregate = aggregateq.get()

    #report.report_results(logger, aggregate, data_type, None, report_file, report_type)

    bad_indexes = {}
    if report_by_files:
        report.add_bad_indexes_per_file(aggregate, bad_indexes, file_list, offset_list)
    else:
        report.add_bad_indexes(aggregate, bad_indexes)
    try:
        with open(report_file, 'w') as report_fp:
            report.report_bad_indexes(bad_indexes, report_fp)
    except Exception:
        logger.warning('Cannot open report file')


    return bad_indexes
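
utils.get_data_hdf is not shown in these examples; assuming the monitored files are plain HDF5, the same '/exchange/<data_type>' dataset could be read and enqueued slice by slice with h5py directly. The sketch below is an illustration under that assumption, not the project's loader; the Data class and the status constant are passed in as parameters so no import paths have to be guessed.

# Hypothetical h5py-based variant of the slice enqueueing done in verify above.
import h5py
import numpy as np


def enqueue_slices(dataq, file, data_type, Data, status):
    # open the file read-only and stream one 2D slice at a time into the queue
    with h5py.File(file, 'r') as fp:
        dset = fp['/exchange/' + data_type]      # e.g. '/exchange/data'
        for i in range(dset.shape[0]):
            dataq.put(Data(status, np.asarray(dset[i]), data_type))
        return dset.shape[0]                     # number of slices enqueued
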
Example #5
def verify_file_hdf(logger, file, data_tags, limits, quality_checks, report_type, report_dir, consumers):
    """
    This method handles verification of data in an hdf type file.

    This method creates and starts a new handler process. The handler is initialized with the data
    queue, the limits, a result queue, the quality checks, and the consumers. The data type of each
    enqueued slice can be 'data_dark', 'data_white' or 'data'. After starting the process the function
    enqueues the data into the queue slice by slice, until all data is queued. The last enqueued
    element is the end-of-data marker.

    Parameters
    ----------
    logger: Logger
        Logger instance.

    file : str
        a filename including path that will be verified

    data_tags : dict
        a dictionary of data_type/hdf tag pairs

    limits : dict
        a dictionary of limits values

    quality_checks : dict
        a dictionary specifying the quality checks structure that will be applied to verify the data file

    report_type : int
        report type, currently supporting 'none', 'errors', and 'full'

    report_dir : str
        a directory where report files will be located

    consumers : dict
        a dictionary containing consumer processes to run, and their parameters

    Returns
    -------
    bad_indexes : dict
        a dictionary of bad indexes per data type

    """
    def process_data(data_type):
        data_tag = data_tags[data_type]
        dt = fp[data_tag]
        for i in range(0, dt.shape[0]):
            data = Data(const.DATA_STATUS_DATA, dt[i], data_type)
            dataq.put(data)
            # add a delay to slow down the upstream flow, so the downstream
            # flow (results) is handled in sync
            time.sleep(.1)

    fp, tags = utils.get_data_hdf(file)
    dataq = Queue()
    aggregateq = Queue()

    p = Process(target=handler.handle_data, args=(dataq, limits, aggregateq, quality_checks, consumers))
    p.start()

    # assume a fixed order of data types; this will determine indexes on the data
    if 'data_dark' in data_tags:
        process_data('data_dark')
    if 'data_white' in data_tags:
        process_data('data_white')
    if 'data' in data_tags:
        process_data('data')

    dataq.put(Data(const.DATA_STATUS_END))


    # determine the report file location; no report is written when
    # report_type is REPORT_NONE
    report_file = None
    if report_type != const.REPORT_NONE:
        if report_dir is None:
            report_file = file.rsplit(".", 1)[0] + '.report'
        else:
            file = file.rsplit(".", 1)[0]
            file_path = file.rsplit("/")
            report_file = report_dir + "/" + file_path[-1] + '.report'

    # receive the results
    bad_indexes = {}
    aggregate = aggregateq.get()

    if report_file is not None:
        report.report_results(logger, aggregate, None, report_file, report_type)
    report.add_bad_indexes(aggregate, bad_indexes)

    logger.info('data verifier evaluated ' + file + ' file')
    return bad_indexes
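
One detail these examples leave out is shutting the handler process down: p is started but never joined. The sketch below shows a cleanup step that could follow the aggregateq.get() call; it assumes p and aggregateq as defined in the function above and is not part of the project's code.

# Sketch of a clean handler shutdown after the aggregate has been received.
def collect_and_shutdown(p, aggregateq, timeout=30):
    aggregate = aggregateq.get()   # blocks until the handler publishes its results
    p.join(timeout=timeout)        # wait for the handler process to exit
    if p.is_alive():
        p.terminate()              # last resort if the handler hangs
    return aggregate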