示例#1
0
def get_row_object_iterator(check_file,
                            file_name,
                            sheet_name=None,
                            start_point=None,
                            end_point=None,
                            **kwargs):
    """
    Open a workbook through HandleXLSX and build its row-object iterator.

    :param check_file: flag forwarded unchanged to
        ``generate_row_object_iterator``
    :param file_name: workbook file handed to ``HandleXLSX``
    :param sheet_name: sheet name, passed both to ``HandleXLSX`` and to the
        iterator factory (may be None)
    :param start_point: forwarded unchanged to the iterator factory
    :param end_point: forwarded unchanged to the iterator factory
    :param kwargs: extra keyword arguments forwarded to the iterator factory
    :return: tuple of (HandleXLSX instance, row-object iterator)
    :raises GetRowIterError: when opening the workbook or creating the
        iterator raises any exception
    """
    func_name = 'get_row_object_iterator'
    try:
        workbook = HandleXLSX(file_name, sheet_name)
        rows = workbook.generate_row_object_iterator(
            check_file, sheet_name, start_point, end_point, **kwargs)
    except Exception as err:
        # Full traceback goes to the log; the raised error carries only repr.
        nlogger.error('{fn} error: {e}'.format(fn=func_name,
                                               e=traceback.format_exc()))
        raise GetRowIterError('{fn} error: {e}'.format(fn=func_name,
                                                       e=repr(err)))
    return workbook, rows
示例#2
0
def handle_data_thread(row_object_iterator, **kwargs):
    """
    Process every row object concurrently on a thread pool.

    One ``handle_data_task`` call is submitted per row object; truthy task
    results are collected in completion order.

    :param row_object_iterator: iterable of row objects to process
    :param kwargs: keyword arguments forwarded to every ``handle_data_task``
    :return: list of the truthy results returned by the tasks
    :raises ThreadTaskError: when the tasks do not all complete within
        ``TASK_WAITING_TIME`` or when any other exception occurs
    """
    executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)
    try:
        all_task = [
            executor.submit(handle_data_task, row_object, **kwargs)
            for row_object in row_object_iterator
        ]

        data_result = []
        # as_completed raises a timeout error if the whole set of futures has
        # not finished before the given timeout.
        # NOTE(review): which TimeoutError class is in scope depends on the
        # file's imports; before Python 3.11 concurrent.futures.TimeoutError
        # is not the builtin one - confirm.
        for future in as_completed(all_task, timeout=TASK_WAITING_TIME):
            data = future.result()
            if data:
                data_result.append(data)
        nlogger.info(f'Handle data completed, {len(data_result)} rows')
        return data_result
    except TimeoutError as e:
        nlogger.error("{fn} TimeoutError: {e}".format(fn='handle_data_thread',
                                                      e=repr(e)))
        raise ThreadTaskError('{fn} TimeoutError: {e}'.format(
            fn='handle_data_thread', e=repr(e))) from e
    except Exception as e:
        nlogger.error("{fn} error: {e}".format(fn='handle_data_thread',
                                               e=traceback.format_exc()))
        flogger.error("{fn} error: {e}".format(fn='handle_data_thread',
                                               e=repr(e)))
        raise ThreadTaskError('{fn} error: {e}'.format(
            fn='handle_data_thread', e=repr(e))) from e
    finally:
        # Always release the worker threads. Previously the pool was only
        # shut down on the timeout path and leaked on success or on any other
        # error. wait=True lets already-submitted tasks finish first.
        executor.shutdown(wait=True)
示例#3
0
def add_sample_object_property(samples_object, row_object, **kwargs):
    """
    Register one company row in the samples object under all of its names.

    The row's '单位名称' (full name) and '简称' (abbreviation) columns are
    read, a simplified name is derived with ``simplify_company_name``, and
    every name is stored as a reference to the full name, both in the
    per-type properties (``<type>_name``, ``<type>_full_name``,
    ``<type>_abbr_name``) and in the ``all_*`` dictionaries.

    :param samples_object: object exposing ``get_property``/``set_property``
        and the ``all_name``, ``all_full_name``, ``all_abbr_name`` dicts
    :param row_object: row whose ``column_value`` dict holds the columns
    :param kwargs: unused
    :return: the same ``samples_object``, updated in place
    :raises AssertionError: when the full name or the abbreviation is missing
        or blank
    """
    try:
        raw_full_name = row_object.column_value.get('单位名称')
        raw_abbr_name = row_object.column_value.get('简称')
        # str(None) is the truthy string 'None', so a missing cell has to be
        # mapped to '' explicitly or the emptiness checks below never fire.
        full_name = '' if raw_full_name is None else str(raw_full_name).strip()
        abbr_name = '' if raw_abbr_name is None else str(raw_abbr_name).strip()
        # Raised explicitly (not via `assert`) so the validation survives -O.
        if not full_name:
            raise AssertionError("单位名称 can't be empty")
        if not abbr_name:
            raise AssertionError("简称 can't be empty")
        brief_name = simplify_company_name(full_name)

        # sort name type
        _name_type = get_name_type(row_object)

        # structure Samples property name
        _property_name = f'{_name_type}_name'
        _property_full_name = f'{_name_type}_full_name'
        _property_abbr_name = f'{_name_type}_abbr_name'

        _property_name_value = samples_object.get_property(_property_name)
        _property_full_name_value = samples_object.get_property(_property_full_name)
        _property_abbr_name_value = samples_object.get_property(_property_abbr_name)

        # add source: the full-name entry is the terminal record
        _property_full_name_value[full_name] = {
            'termination': True,
            'full_name': raw_full_name,
            'company_type': row_object.column_value.get('单位类别'),
        }
        samples_object.set_property(_property_full_name, _property_full_name_value)

        # add source reference: the full name points at itself
        _property_name_value[full_name] = {_property_full_name: full_name}
        samples_object.set_property(_property_name, _property_name_value)

        samples_object.all_full_name[full_name] = {_property_full_name: full_name}
        samples_object.all_name[full_name] = {_property_full_name: full_name}

        # The abbreviation, and the simplified name when it differs, are
        # aliases that reference the full name.
        aliases = [abbr_name]
        if brief_name != abbr_name:
            aliases.append(brief_name)

        for alias in aliases:
            _property_abbr_name_value[alias] = {_property_full_name: full_name}
            samples_object.set_property(_property_abbr_name, _property_abbr_name_value)

            _property_name_value[alias] = {_property_full_name: full_name}
            samples_object.set_property(_property_name, _property_name_value)

            samples_object.all_abbr_name[alias] = {_property_full_name: full_name}
            samples_object.all_name[alias] = {_property_full_name: full_name}

        return samples_object
    except Exception as e:
        nlogger.error('{fn} Undefined error: {e}'.format(fn='add_sample_object_property', e=traceback.format_exc()))
        print(f'Undefined error: {repr(e)}')
        raise
示例#4
0
def handle_data_task(row_object, **kwargs):
    """
    Validate and process a single row, converting every failure into a
    row-level error record instead of raising.

    :param row_object: row object to check and process
    :param kwargs: keyword arguments forwarded to ``check_row_object`` and
        ``handle_data``
    :return: the row returned by ``handle_data`` on success; otherwise the
        original ``row_object`` with ``status`` set to the error value and
        ``column_value['result']`` describing the failure
    """

    def _record_failure(result_text):
        """Flag the row as failed, store the message, return a company name for logs."""
        row_object.status = RowStatus.ERROR.value
        column_value = getattr(row_object, 'column_value', None)
        if not isinstance(column_value, dict):
            # The row may be malformed; give it a dict so the result can
            # still be reported back to the caller.
            column_value = {}
            setattr(row_object, 'column_value', column_value)
        column_value['result'] = result_text
        return str(column_value.get('公司', '未知单位')).strip()

    try:
        check_row_object(row_object, **kwargs)
        _row_object = handle_data(row_object, **kwargs)
        return _row_object
    except AssertionError as e:
        nlogger.error("{fn} Params error: {e}".format(
            fn='handle_data_task', e=traceback.format_exc()))
        _company_name = _record_failure(f'Params AssertionError: {str(e)}')
        flogger.error("handle_data_task failed:{n},AssertionError:{e}".format(
            n=_company_name, e=repr(e)))
        return row_object
    except HandleDataError as e:
        _company_name = _record_failure(f'HandleDataError: {str(e)}')
        flogger.error("handle_data_task failed:{n},HandleDataError:{e}".format(
            n=_company_name, e=repr(e)))
        return row_object
    except Exception as e:
        _company_name = _record_failure(
            f'handle_data_task undefined error: {str(e)}')
        # Use tolerant lookups here: indexing column_value['公司'] or reading
        # a missing `position` would raise inside the handler and let the
        # exception escape the task.
        nlogger.error(
            "{fn} position:{p}, company name:{n}, undefined error: {e}".format(
                fn='handle_data_task',
                p=getattr(row_object, 'position', None),
                n=row_object.column_value.get('公司', '未知单位'),
                e=traceback.format_exc()))
        flogger.error(
            "handle_data_task undefined failed:{n},HandleDataError:{e}".format(
                n=_company_name, e=repr(e)))
        return row_object
示例#5
0
def get_file_path(file_name, root_path, file_relative_path=None, **kwargs):
    """
    Build the absolute path of a file, creating its directory if needed.

    All string arguments are stripped of surrounding whitespace *before*
    they are validated, so a value such as ``" /etc"`` cannot slip past the
    relative-path check and then make ``os.path.join`` discard the root.

    :param file_name: The name of the file; must be a non-blank string
        without '/'
    :param root_path: root path; must be a non-blank string starting with '/'
    :param file_relative_path: optional directory below ``root_path``; must
        not start with '/'. None or '' means "directly in root_path"
    :param kwargs: unused
    :return: ``root_path/file_relative_path/file_name``
    :raises UndefinedError: when validation fails or any other error occurs
    """

    def _require(condition, message):
        # Explicit raise instead of `assert`, which is stripped under -O.
        if not condition:
            raise AssertionError(message)

    try:
        _require(isinstance(file_name, str) and file_name.strip(),
                 "Parameter file_name must be string and not be empty")
        _file_name = file_name.strip()
        _require('/' not in _file_name,
                 "Parameter file_name cannot contain a path")

        _require(isinstance(root_path, str) and root_path.strip(),
                 "Parameter root_path must be string and not be empty")
        _root_path = root_path.strip()
        _require(_root_path.startswith('/'),
                 "Parameter root_path must be absolute path")

        if file_relative_path is None or file_relative_path == '':
            _file_relative_path = ''
        else:
            _require(isinstance(file_relative_path, str),
                     "Parameter file_relative_path must be string")
            _file_relative_path = file_relative_path.strip()
            _require(not _file_relative_path.startswith('/'),
                     "Parameter file_relative_path must be relative path")

        _absolute_path = os.path.join(_root_path, _file_relative_path)
        try:
            if not os.path.exists(_absolute_path):
                # exist_ok=True: a directory created concurrently is fine.
                os.makedirs(_absolute_path, exist_ok=True)
        except FileExistsError as e:
            # Best effort: report and still return the computed path.
            print(f'FileExistsError: {repr(e)}')

        return os.path.join(_absolute_path, _file_name)
    except Exception as e:
        nlogger.error('{fn} error: {e}'.format(fn='get_file_path',
                                               e=traceback.format_exc()))
        raise UndefinedError('{fn} error: {e}'.format(fn='get_file_path',
                                                      e=repr(e))) from e
示例#6
0
def get_file_md5(file, file_iterator, chunk_size=4096 * 1024, **kwargs):
    """
    Compute the upper-case hexadecimal MD5 digest of a file.

    The content is pulled piece by piece from
    ``file_iterator(file, chunk_size, **kwargs)`` so the whole file never has
    to be held in memory.

    :param file: value passed through to ``file_iterator``
    :param file_iterator: callable returning an iterable of bytes chunks
    :param chunk_size: chunk size handed to ``file_iterator`` (default 4MB)
    :param kwargs: extra keyword arguments for ``file_iterator``
    :return: MD5 hex digest in upper case
    :raises FunctionError: when reading or hashing fails
    """
    try:
        digest = hashlib.md5()
        chunks = file_iterator(file, chunk_size, **kwargs)
        for chunk in chunks:
            digest.update(chunk)
        return digest.hexdigest().upper()
    except Exception as err:
        func_name = 'get_file_md5'
        nlogger.error('{fn} error: {e}'.format(fn=func_name,
                                               e=traceback.format_exc()))
        raise FunctionError('{fn} error: {e}'.format(fn=func_name,
                                                     e=repr(err)))
示例#7
0
def file_iterator(file, encoding="utf-8"):
    """
    Lazily yield the lines of a text file one at a time.

    :param file: path of the file to open in text mode
    :param encoding: text encoding; None falls back to "utf-8"
    :return: generator of lines (line endings as read from the file)
    :raises FunctionError: when opening or reading the file fails; raised
        while the generator is being consumed, not when it is created
    """
    try:
        if encoding is None:
            encoding = "utf-8"

        with open(file, "r", encoding=encoding) as f:
            for line in f:
                yield line
    except Exception as e:
        # The log label used to read 'stream_iterator' (copy-paste slip),
        # which pointed debugging at the wrong function.
        nlogger.error('{fn} error: {e}'.format(fn='file_iterator',
                                               e=traceback.format_exc()))
        raise FunctionError('{fn} error: {e}'.format(fn='file_iterator',
                                                     e=repr(e))) from e
示例#8
0
def stream_iterator(file, chunk_size, **kwargs):
    """
    Lazily yield the binary content of a file in fixed-size pieces.

    :param file: path of the file to open in binary mode
    :param chunk_size: bytes to read per piece; any non-int value (including
        None) falls back to 4MB
    :param kwargs: unused
    :return: generator of non-empty bytes objects
    :raises FunctionError: when opening or reading the file fails; raised
        while the generator is being consumed
    """
    try:
        if not isinstance(chunk_size, int):
            chunk_size = 4096 * 1024

        with open(file, "rb") as stream:
            # read() returns b"" at end of file, which ends the iteration.
            for piece in iter(lambda: stream.read(chunk_size), b""):
                yield piece
    except Exception as err:
        func_name = 'stream_iterator'
        nlogger.error('{fn} error: {e}'.format(fn=func_name,
                                               e=traceback.format_exc()))
        raise FunctionError('{fn} error: {e}'.format(fn=func_name,
                                                     e=repr(err)))
示例#9
0
def handle_data(row_object, **kwargs):
    """
    Resolve the company named in a row and attach company info to the row.

    The '公司' column is converted to a stripped string, passed through
    ``extract_company_name`` and stored as ``kwargs['company_name']`` before
    delegating to ``set_row_object_company_info``.

    :param row_object: row whose ``column_value`` dict holds the '公司' column
    :param kwargs: must contain a truthy ``samples_object``; forwarded,
        together with ``company_name``, to ``set_row_object_company_info``
    :return: the value returned by ``set_row_object_company_info``
    :raises AssertionError: when the extracted company name or
        ``samples_object`` is falsy
    :raises HandleDataError: when any other exception occurs
    """
    try:
        raw_company_name = str(row_object.column_value.get('公司')).strip()
        kwargs['company_name'] = extract_company_name(raw_company_name)

        assert kwargs.get('company_name'), "company name is invalid"
        assert kwargs.get('samples_object'), "company dict is invalid"

        return set_row_object_company_info(row_object, **kwargs)
    except AssertionError:
        # Validation failures are reported by the caller; pass them through.
        raise
    except Exception as err:
        func_name = 'handle_data'
        nlogger.error("{fn} error: {e}".format(fn=func_name,
                                               e=traceback.format_exc()))
        raise HandleDataError("{fn} error: {e}".format(fn=func_name,
                                                       e=repr(err)))
示例#10
0
def exec_func(check_file,
              file_name=None,
              sheet_name=None,
              start_point=None,
              end_point=None,
              **kwargs):
    """
    Run the whole pipeline: load the samples workbook, process the data
    workbook on the thread pool and write the results back.

    Every exception is logged and printed here; nothing is re-raised.

    :param check_file: flag forwarded to ``get_row_object_iterator``
    :param file_name: Excel file holding the data rows
    :param sheet_name: sheet of the data file to read
    :param start_point: forwarded to ``get_row_object_iterator``
    :param end_point: forwarded to ``get_row_object_iterator``
    :param kwargs: forwarded to every helper called here
    :return: None
    """
    func_name = 'exec_func'
    try:
        # Step 1: build the samples object from the 'listing' sheet.
        samples_file = check_file_name(SAMPLES_FILE, **kwargs)
        _, sample_rows = get_row_object_iterator(
            check_file, samples_file, 'listing', **kwargs)
        samples_object = get_samples_object(sample_rows, **kwargs)
        nlogger.info("get_samples_object has been completed")

        # Step 2: open the data workbook.
        data_file = check_file_name(file_name, **kwargs)
        data_xls, data_rows = get_row_object_iterator(
            check_file, data_file, sheet_name, start_point, end_point,
            **kwargs)

        # Step 3: process the rows concurrently.
        nlogger.info("handle_data_thread start")
        results = handle_data_thread(row_object_iterator=data_rows,
                                     samples_object=samples_object,
                                     **kwargs)

        # Step 4: persist the outcome.
        nlogger.info("write_result_to_xls start")
        write_result_to_xls(data_xls, results)
    except (GetRowIterError, HandleDataError, ThreadTaskError,
            WriteResultError) as err:
        nlogger.error('{fn} Custom error: {e}'.format(fn=func_name,
                                                      e=repr(err)))
        print(f'Custom error: {repr(err)}')
    except AssertionError as err:
        nlogger.error('{fn} Assertion error: {e}'.format(
            fn=func_name, e=traceback.format_exc()))
        print(repr(err))
    except Exception as err:
        nlogger.error('{fn} error: {e}'.format(fn=func_name,
                                               e=traceback.format_exc()))
        print(f'Undefined error: {repr(err)}')
示例#11
0
def download_large_file(download_url, absolute_path_file, file_name,
                        chunk_size, retry, **kwargs):
    """
    Download a large file with resume support (HTTP Range requests).

    Data is appended to ``absolute_path_file``; each attempt asks the server
    only for the bytes after the size reported by
    ``check_temp_file_exists``.

    :param download_url: url
    :param absolute_path_file: the file including absolute path
    :param file_name: file name, used in log messages only
    :param chunk_size: chunk size passed to ``iter_content``
    :param retry: maximum number of attempts
    :param kwargs: unused
    :return: the name of the written file, or None on failure
    """
    _temp_size = check_temp_file_exists(absolute_path_file)

    for attempt in range(retry):
        if attempt:
            # A failed attempt may already have appended bytes. Re-measure so
            # the Range offset matches what is on disk; reusing the stale
            # offset would download, and append, the same bytes twice.
            _temp_size = check_temp_file_exists(absolute_path_file)

        headers = {
            'Range':
            'bytes=%d-' % _temp_size,
            'Connection':
            'keep-alive',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, '
            'like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
        }

        try:
            with closing(
                    download_file_requester.get_url(url=download_url,
                                                    stream=True,
                                                    headers=headers)) as res:
                with open(absolute_path_file, 'ab+') as f:
                    for chunk in res.iter_content(chunk_size=chunk_size):
                        if chunk:
                            f.write(chunk)
            return f.name
        except RequestException as e:
            # Network-level failure: log it and try again.
            error_msg = f"Download failed::{file_name}, storage path is {absolute_path_file}," \
                        f"RequestException error: {repr(e)}"
            nlogger.error(error_msg)
            continue
        except Exception as e:
            # Build the message by concatenation: reusing an f-string as a
            # %-template breaks when the name or path contains '%'.
            error_msg = f"Download failed:{file_name}, storage path is {absolute_path_file}, undefined error: "
            nlogger.error(error_msg + traceback.format_exc())
            flogger.error(error_msg + repr(e))
            return

    # Reached only when every attempt ended in a RequestException
    # (or retry is 0).
    error_msg = f"Download failed:{file_name}, storage path is {absolute_path_file}, {retry} retries failed"
    nlogger.error(error_msg)
    flogger.error(error_msg)
    return
示例#12
0
def download_small_file(download_url, absolute_path_file, file_name, retry,
                        **kwargs):
    """
    Download a small file in one piece, retrying on request errors.

    The target is opened in 'wb' mode, so every attempt starts from an empty
    file.

    :param download_url:  url
    :param absolute_path_file: the file including absolute path
    :param file_name: file name, used in log messages only
    :param retry: maximum number of attempts
    :param kwargs: unused
    :return: the name of the written file, or None on failure
    """
    headers = {
        'Connection':
        'keep-alive',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, '
        'like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
    }

    for _ in range(retry):
        try:
            with closing(
                    download_file_requester.get_url(url=download_url,
                                                    stream=True,
                                                    headers=headers)) as res:
                with open(absolute_path_file, mode='wb') as f:
                    f.write(res.content)
            return f.name
        except RequestException as e:
            # Network-level failure: log it and try again.
            error_msg = f"Download failed::{file_name}, storage path is {absolute_path_file}," \
                        f"RequestException error: {repr(e)}"
            nlogger.error(error_msg)
            continue
        except Exception as e:
            # Build the message by concatenation: reusing an f-string as a
            # %-template breaks when the name or path contains '%'.
            error_msg = f"Download failed:{file_name}, storage path is {absolute_path_file}, undefined error: "
            nlogger.error(error_msg + traceback.format_exc())
            flogger.error(error_msg + repr(e))
            return

    # Reached only when every attempt ended in a RequestException
    # (or retry is 0).
    error_msg = f"Download failed:{file_name}, storage path is {absolute_path_file}, {retry} retries failed"
    nlogger.error(error_msg)
    flogger.error(error_msg)
    return
示例#13
0
def write_result_to_xls(source_xls, data_result):
    """
    Write the processing outcome of every row into the workbook and save it
    under a timestamped file name.

    For each output header the column index is looked up with
    ``recursive_get_index``; when that index lies beyond the original column
    count, the header is written into row 1. Each row then gets its 'result'
    cell (default 'unknown') and every optional value that is truthy.

    :param source_xls: workbook wrapper providing ``get_column_name_list``,
        ``write_sheet_rows_value``, ``sheet`` and ``save``
    :param data_result: iterable of row objects exposing ``position``,
        ``sheet_name`` and ``column_value``
    :return: None
    :raises WriteResultError: when any step fails
    """
    # Output headers, in the order their columns are resolved.
    header_names = ('result', 'name', 'guess_name', 'full_name', 'type',
                    'similarity')
    # (key in row.column_value, header of the destination column)
    optional_fields = (
        ('company_name', 'name'),
        ('guess_name', 'guess_name'),
        ('company_full_name', 'full_name'),
        ('company_type', 'type'),
        ('similarity', 'similarity'),
    )

    try:
        column_name_list = source_xls.get_column_name_list()
        columns_number = len(column_name_list)

        column_of = {}
        for header in header_names:
            column = recursive_get_index(column_name_list, header) + 1
            if column > columns_number:
                # Column is not part of the original sheet: add its header.
                source_xls.write_sheet_rows_value(
                    sheet_name=source_xls.sheet.title,
                    values=(1, column, header))
            column_of[header] = column

        for row_object in data_result:
            row_number = row_object.position

            outcome = row_object.column_value.get('result', 'unknown')
            source_xls.write_sheet_rows_value(
                sheet_name=row_object.sheet_name,
                values=(row_number, column_of['result'], outcome))

            for key, header in optional_fields:
                cell_value = row_object.column_value.get(key)
                if not cell_value:
                    continue
                source_xls.write_sheet_rows_value(
                    sheet_name=row_object.sheet_name,
                    values=(row_number, column_of[header], cell_value))

        timestamp = datetime.now().strftime('%Y%m%d-%H:%M:%S')
        _result_file_name = "result_{d}.xlsx".format(d=timestamp)
        source_xls.save(_result_file_name)
        nlogger.info(
            f'Write result completed, output file: {_result_file_name}')
    except Exception as err:
        func_name = 'write_result_to_xls'
        nlogger.error("{fn} error: {e}".format(fn=func_name,
                                               e=traceback.format_exc()))
        raise WriteResultError("{fn} error: {e}".format(fn=func_name,
                                                        e=repr(err)))
示例#14
0
def download_video(download_url, absolute_path_file, file_name, **kwargs):
    """
    Download a video with the wget module, trying up to three times.

    Timeouts, HTTP errors and URL errors are retried after a short sleep; on
    the third attempt, or on any other exception, the failure is logged and
    None is returned.

    :param download_url: temporary download url
    :param absolute_path_file: storage absolute path of video file
    :param file_name: file_name of video, used in log messages
    :param kwargs: unused
    :return: the value returned by ``wget.download`` on success, else None
    """
    try:
        started_at = time.time()
        nlogger.info('Download start:{f}.'.format(f=absolute_path_file))

        for attempt in range(3):
            # attempt takes the values 0, 1, 2; only the last one gives up.
            is_last_attempt = attempt > 1
            try:
                # wget.download has no timeout of its own, so a default
                # socket timeout is set to keep a stalled transfer from
                # hanging forever.
                socket.setdefaulttimeout(SOCKET_TIMEOUT)
                saved_as = wget.download(download_url,
                                         out=absolute_path_file,
                                         bar=None)
                finished_at = time.time()
                nlogger.info('Download success:{f}, it takes {t:.2f}s.'.format(
                    f=absolute_path_file, t=finished_at - started_at))
                slogger.info(
                    'Download success:{f}, its storage path is {p}.'.format(
                        f=file_name, p=absolute_path_file))
                return saved_as
            except socket.timeout:
                message = f"Download timeout: {file_name},storage path is {absolute_path_file},url is{download_url}"
                nlogger.error(message)
                if is_last_attempt:
                    raise WGetError(message)
                time.sleep(1)
            except HTTPError as err:
                message = "Download HTTPError:{0}, {1}, {2}, storage path is {3}".format(
                    file_name, download_url, err.code, absolute_path_file)
                nlogger.error(message)
                if is_last_attempt:
                    raise WGetError(message)
                time.sleep(random.randint(1, 2))
            except URLError as err:
                message = "Download URLError:{0}, {1}, {2}, storage path is {3}".format(
                    file_name, download_url, err.reason, absolute_path_file)
                nlogger.error(message)
                if is_last_attempt:
                    raise WGetError(message)
                time.sleep(random.randint(1, 2))
            except Exception as err:
                # Anything else is not retried.
                nlogger.error(
                    'Download failed:{f}, storage path is {p},WGet error: {e}'.
                    format(f=file_name,
                           p=absolute_path_file,
                           e=traceback.format_exc()))
                raise WGetError(
                    'Download failed:{f},storage path is {p},WGet error: {e}'.
                    format(f=file_name, p=absolute_path_file, e=repr(err)))
        else:
            # Defensive: every attempt above either returns or raises.
            message = 'Download failed:{f},storage path is {p},WGet retry failed'.format(
                f=file_name, p=absolute_path_file)
            nlogger.error(message)
            raise WGetError(message)
    except WGetError as err:
        # Already logged in detail above; record it in the failure log.
        flogger.error(repr(err))
        return
    except Exception as err:
        nlogger.error('{fn} download {f} error: {e}'.format(
            fn='download_video', f=file_name, e=traceback.format_exc()))
        flogger.error("Download failed:{f}, error: {e}".format(f=file_name,
                                                               e=repr(err)))
        return