Exemplo n.º 1
0
def _mark_row_failed(row_object, result_message):
    """
    Flag a row as failed and store the failure text on it
    :param row_object: row object being processed
    :param result_message: text stored under the 'result' key of column_value
    :return: company name of the row for logging ('未知单位' when unavailable)
    """
    row_object.status = RowStatus.ERROR.value
    column_value = getattr(row_object, 'column_value', None)
    if not isinstance(column_value, dict):
        # No usable column dict: attach a fresh one so the failure text is
        # still carried by the row.
        setattr(row_object, 'column_value', {'result': result_message})
        return '未知单位'
    column_value['result'] = result_message
    return str(column_value.get('公司', '未知单位')).strip()


def handle_data_task(row_object, **kwargs):
    """
    Process one row and never raise: every failure is recorded on the row
    :param row_object: row object; it is validated by check_row_object and
        then passed to handle_data
    :param kwargs: forwarded unchanged to check_row_object and handle_data
    :return: the row returned by handle_data on success; on any failure the
        input row with status set to RowStatus.ERROR.value and the failure
        text stored in column_value['result']
    """
    try:
        check_row_object(row_object, **kwargs)
        return handle_data(row_object, **kwargs)
    except AssertionError as e:
        nlogger.error("{fn} Params error: {e}".format(
            fn='handle_data_task', e=traceback.format_exc()))
        _company_name = _mark_row_failed(
            row_object, f'Params AssertionError: {str(e)}')
        flogger.error("handle_data_task failed:{n},AssertionError:{e}".format(
            n=_company_name, e=repr(e)))
        return row_object
    except HandleDataError as e:
        _company_name = _mark_row_failed(row_object,
                                         f'HandleDataError: {str(e)}')
        flogger.error("handle_data_task failed:{n},HandleDataError:{e}".format(
            n=_company_name, e=repr(e)))
        return row_object
    except Exception as e:
        # Grab the traceback before doing anything else that could fail.
        _trace = traceback.format_exc()
        _company_name = _mark_row_failed(
            row_object, f'handle_data_task undefined error: {str(e)}')
        # The row may be malformed here, so neither the position attribute
        # nor the '公司' key is assumed to exist: a failure inside this
        # handler would escape the task.
        nlogger.error(
            "{fn} position:{p}, company name:{n}, undefined error: {e}".format(
                fn='handle_data_task',
                p=getattr(row_object, 'position', None),
                n=_company_name,
                e=_trace))
        # NOTE(review): the "HandleDataError" label below is kept as-is for
        # log compatibility although this branch handles any other exception.
        flogger.error(
            "handle_data_task undefined failed:{n},HandleDataError:{e}".format(
                n=_company_name, e=repr(e)))
        return row_object
Exemplo n.º 2
0
def exec_func(check_file,
              file_name=None,
              sheet_name=None,
              start_point=None,
              end_point=None,
              **kwargs):
    """
    Run the complete job: load the samples, process the data rows, write results
    :param check_file: forwarded to get_row_object_iterator for both workbooks
    :param file_name: name of the Excel file holding the rows to process
    :param sheet_name: sheet of the data file to read
    :param start_point: first row to read, forwarded to get_row_object_iterator
    :param end_point: last row to read, forwarded to get_row_object_iterator
    :param kwargs: forwarded to every helper called here
    :return: None; every error is logged and printed instead of being raised
    """
    try:
        # Step 1: build the samples object from the 'listing' sheet
        samples_path = check_file_name(SAMPLES_FILE, **kwargs)
        _, listing_rows = get_row_object_iterator(check_file, samples_path,
                                                  'listing', **kwargs)
        samples_object = get_samples_object(listing_rows, **kwargs)
        nlogger.info(f"get_samples_object has been completed")

        # Step 2: open the workbook that holds the rows to process
        data_path = check_file_name(file_name, **kwargs)
        data_xls, data_rows = get_row_object_iterator(check_file, data_path,
                                                      sheet_name, start_point,
                                                      end_point, **kwargs)

        # Step 3: process the rows in worker threads
        nlogger.info(f"handle_data_thread start")
        processed_rows = handle_data_thread(row_object_iterator=data_rows,
                                            samples_object=samples_object,
                                            **kwargs)

        # Step 4: persist the results
        nlogger.info(f"write_result_to_xls start")
        write_result_to_xls(data_xls, processed_rows)
    except (GetRowIterError, HandleDataError, ThreadTaskError,
            WriteResultError) as e:
        error_text = repr(e)
        nlogger.error('{fn} Custom error: {e}'.format(fn='exec_func',
                                                      e=error_text))
        print(f'Custom error: {error_text}')
    except AssertionError as e:
        trace_text = traceback.format_exc()
        nlogger.error('{fn} Assertion error: {e}'.format(fn='exec_func',
                                                         e=trace_text))
        print(repr(e))
    except Exception as e:
        trace_text = traceback.format_exc()
        nlogger.error('{fn} error: {e}'.format(fn='exec_func', e=trace_text))
        print(f'Undefined error: {repr(e)}')
Exemplo n.º 3
0
def handle_data_thread(row_object_iterator, **kwargs):
    """
    Handle rows concurrently, one handle_data_task call per row
    :param row_object_iterator: iterable of row objects
    :param kwargs: forwarded unchanged to handle_data_task
    :return: list of the truthy task results, in completion order
    :raises ThreadTaskError: when the tasks do not all finish within
        TASK_WAITING_TIME seconds, or on any other failure
    """
    # Before Python 3.11 as_completed raises concurrent.futures.TimeoutError,
    # which is a different class from the builtin TimeoutError; catch both so
    # the timeout branch is taken on every interpreter version.
    from concurrent.futures import TimeoutError as FuturesTimeoutError

    executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)
    try:
        all_task = [
            executor.submit(handle_data_task, row_object, **kwargs)
            for row_object in row_object_iterator
        ]

        data_result = []
        # Raises a timeout error if the entire result iterator could not be
        # generated before the given timeout.
        for future in as_completed(all_task, timeout=TASK_WAITING_TIME):
            data = future.result()
            if data:
                data_result.append(data)
        nlogger.info(f'Handle data completed, {len(data_result)} rows')
        return data_result
    except (FuturesTimeoutError, TimeoutError) as e:
        nlogger.error("{fn} TimeoutError: {e}".format(fn='handle_data_thread',
                                                      e=repr(e)))
        raise ThreadTaskError('{fn} TimeoutError: {e}'.format(
            fn='handle_data_thread', e=repr(e))) from e
    except Exception as e:
        nlogger.error("{fn} error: {e}".format(fn='handle_data_thread',
                                               e=traceback.format_exc()))
        flogger.error("{fn} error: {e}".format(fn='handle_data_thread',
                                               e=repr(e)))
        raise ThreadTaskError('{fn} error: {e}'.format(fn='handle_data_thread',
                                                       e=repr(e))) from e
    finally:
        # Always release the pool, on success as well as on failure; this
        # waits for the already submitted tasks to finish first.
        executor.shutdown(wait=True)
Exemplo n.º 4
0
def get_row_object_iterator(check_file,
                            file_name,
                            sheet_name=None,
                            start_point=None,
                            end_point=None,
                            **kwargs):
    """
    Open a workbook and build the iterator over its row objects
    :param check_file: forwarded to generate_row_object_iterator
    :param file_name: workbook file name given to HandleXLSX
    :param sheet_name: sheet name given to HandleXLSX and to the iterator
    :param start_point: forwarded to generate_row_object_iterator
    :param end_point: forwarded to generate_row_object_iterator
    :param kwargs: forwarded to generate_row_object_iterator
    :return: tuple (HandleXLSX instance, iterator of row objects)
    :raises GetRowIterError: wraps any exception raised while opening the
        workbook or creating the iterator
    """
    try:
        workbook = HandleXLSX(file_name, sheet_name)
        rows = workbook.generate_row_object_iterator(check_file, sheet_name,
                                                     start_point, end_point,
                                                     **kwargs)
    except Exception as e:
        trace_text = traceback.format_exc()
        nlogger.error('{fn} error: {e}'.format(fn='get_row_object_iterator',
                                               e=trace_text))
        raise GetRowIterError('{fn} error: {e}'.format(
            fn='get_row_object_iterator', e=repr(e)))
    else:
        return workbook, rows
Exemplo n.º 5
0
def handle_data(row_object, **kwargs):
    """
    Extract the company name of a row and fill in its company information
    :param row_object: row object whose column_value dict holds the '公司' cell
    :param kwargs: must contain a truthy 'samples_object'; the extracted
        'company_name' is added before the call to set_row_object_company_info
    :return: the value returned by set_row_object_company_info
    :raises AssertionError: when the company cell is missing or blank, when
        no company name can be extracted, or when 'samples_object' is empty
    :raises HandleDataError: wraps any other exception
    """
    try:
        _raw_company_name = row_object.column_value.get('公司')
        # str(None) would yield the literal text 'None', so a missing cell is
        # mapped to an empty string and rejected below.
        _source_company_name = ('' if _raw_company_name is None else
                                str(_raw_company_name)).strip()
        # Explicit raises instead of assert statements: asserts are removed
        # when Python runs with -O, and callers rely on AssertionError.
        if not _source_company_name:
            raise AssertionError("company name is invalid")

        kwargs['company_name'] = extract_company_name(_source_company_name)
        if not kwargs.get('company_name'):
            raise AssertionError("company name is invalid")
        if not kwargs.get('samples_object'):
            raise AssertionError("company dict is invalid")

        return set_row_object_company_info(row_object, **kwargs)
    except AssertionError:
        # Validation failures are reported to the caller unchanged.
        raise
    except Exception as e:
        nlogger.error("{fn} error: {e}".format(fn='handle_data',
                                               e=traceback.format_exc()))
        raise HandleDataError("{fn} error: {e}".format(
            fn='handle_data', e=repr(e))) from e
Exemplo n.º 6
0
def add_sample_object_property(samples_object, row_object, **kwargs):
    """
    Register the names of one listing row in the samples object
    :param samples_object: samples object; its '<type>_name',
        '<type>_full_name' and '<type>_abbr_name' properties and its
        all_full_name / all_abbr_name / all_name dicts are updated
    :param row_object: row object whose column_value dict holds the
        '单位名称', '简称' and '单位类别' cells
    :param kwargs: accepted for call compatibility, not used here
    :return: the updated samples_object
    :raises AssertionError: when '单位名称' or '简称' is missing or blank
    """
    try:
        _raw_full_name = row_object.column_value.get('单位名称')
        _raw_abbr_name = row_object.column_value.get('简称')
        # str(None) would yield the literal text 'None' and slip through the
        # emptiness checks, registering a bogus 'None' alias.
        full_name = ('' if _raw_full_name is None else
                     str(_raw_full_name)).strip()
        abbr_name = ('' if _raw_abbr_name is None else
                     str(_raw_abbr_name)).strip()
        # Explicit raises instead of assert statements, which are removed
        # when Python runs with -O.
        if not full_name:
            raise AssertionError("单位名称 can't be empty")
        if not abbr_name:
            raise AssertionError("简称 can't be empty")
        brief_name = simplify_company_name(full_name)

        # The name type selects which group of properties is updated
        _name_type = get_name_type(row_object)

        # Build the property names of that group
        _property_name = f'{_name_type}_name'
        _property_full_name = f'{_name_type}_full_name'
        _property_abbr_name = f'{_name_type}_abbr_name'

        _property_name_value = samples_object.get_property(_property_name)
        _property_full_name_value = samples_object.get_property(
            _property_full_name)
        _property_abbr_name_value = samples_object.get_property(
            _property_abbr_name)

        # The full name is the terminal entry carrying the company data
        _property_full_name_value[full_name] = {
            'termination': True,
            'full_name': row_object.column_value.get('单位名称'),
            'company_type': row_object.column_value.get('单位类别'),
        }
        samples_object.set_property(_property_full_name,
                                    _property_full_name_value)

        # The full name also refers to itself in the combined lookups
        _property_name_value[full_name] = {_property_full_name: full_name}
        samples_object.set_property(_property_name, _property_name_value)
        samples_object.all_full_name[full_name] = {
            _property_full_name: full_name}
        samples_object.all_name[full_name] = {_property_full_name: full_name}

        # Every alias refers back to the full name: the abbreviation always,
        # the simplified name only when it differs from the abbreviation.
        _aliases = [abbr_name]
        if brief_name != abbr_name:
            _aliases.append(brief_name)
        for _alias in _aliases:
            _property_abbr_name_value[_alias] = {
                _property_full_name: full_name}
            samples_object.set_property(_property_abbr_name,
                                        _property_abbr_name_value)

            _property_name_value[_alias] = {_property_full_name: full_name}
            samples_object.set_property(_property_name, _property_name_value)

            samples_object.all_abbr_name[_alias] = {
                _property_full_name: full_name}
            samples_object.all_name[_alias] = {_property_full_name: full_name}

        return samples_object
    except Exception as e:
        nlogger.error('{fn} Undefined error: {e}'.format(
            fn='add_sample_object_property', e=traceback.format_exc()))
        print(f'Undefined error: {repr(e)}')
        raise
Exemplo n.º 7
0
def write_result_to_xls(source_xls, data_result):
    """
    Write the processed rows into the workbook and save it under a new name
    :param source_xls: workbook wrapper providing get_column_name_list,
        write_sheet_rows_value, sheet.title and save
    :param data_result: iterable of row objects exposing position,
        sheet_name and a column_value dict
    :return: None
    :raises WriteResultError: wraps any exception raised while writing
    """
    try:
        column_name_list = source_xls.get_column_name_list()
        columns_number = len(column_name_list)

        # (header written in row 1, key read from row_object.column_value);
        # these columns are written only when the row holds a truthy value.
        _optional_columns = (
            ('name', 'company_name'),
            ('guess_name', 'guess_name'),
            ('full_name', 'company_full_name'),
            ('type', 'company_type'),
            ('similarity', 'similarity'),
        )
        _headers = ('result',) + tuple(h for h, _ in _optional_columns)

        # Resolve the 1-based column of every output header. A header is
        # written only when its column lies beyond the columns that existed
        # when the sheet was read.
        # NOTE(review): presumably recursive_get_index appends a missing
        # header to column_name_list so each new header gets its own column;
        # confirm against the helper.
        _column_of = {}
        for _header in _headers:
            _y = recursive_get_index(column_name_list, _header) + 1
            if _y > columns_number:
                source_xls.write_sheet_rows_value(
                    sheet_name=source_xls.sheet.title,
                    values=(1, _y, _header))
            _column_of[_header] = _y

        for row_object in data_result:
            x = row_object.position

            # The result column is always written, 'unknown' when absent
            source_xls.write_sheet_rows_value(
                sheet_name=row_object.sheet_name,
                values=(x, _column_of['result'],
                        row_object.column_value.get('result', 'unknown')))

            for _header, _key in _optional_columns:
                _value = row_object.column_value.get(_key)
                if not _value:
                    continue
                source_xls.write_sheet_rows_value(
                    sheet_name=row_object.sheet_name,
                    values=(x, _column_of[_header], _value))

        # No ':' in the timestamp: it is not a legal file-name character on
        # Windows, where saving under such a name fails.
        _result_file_name = "result_{d}.xlsx".format(
            d=datetime.now().strftime('%Y%m%d-%H%M%S'))
        source_xls.save(_result_file_name)
        nlogger.info(
            f'Write result completed, output file: {_result_file_name}')
    except Exception as e:
        nlogger.error("{fn} error: {e}".format(fn='write_result_to_xls',
                                               e=traceback.format_exc()))
        raise WriteResultError("{fn} error: {e}".format(
            fn='write_result_to_xls', e=repr(e))) from e