def get_row_object_iterator(check_file,
                            file_name,
                            sheet_name=None,
                            start_point=None,
                            end_point=None,
                            **kwargs):
    """
    Open a workbook through ``HandleXLSX`` and build its row object iterator.

    :param check_file: forwarded unchanged to ``generate_row_object_iterator``
    :param file_name: file name handed to ``HandleXLSX``
    :param sheet_name: sheet name handed to ``HandleXLSX`` and to the iterator factory
    :param start_point: forwarded to the iterator factory
    :param end_point: forwarded to the iterator factory
    :param kwargs: extra keyword arguments forwarded to the iterator factory
    :return: tuple ``(HandleXLSX instance, row object iterator)``
    :raises GetRowIterError: when opening the workbook or building the iterator fails
    """
    fn = 'get_row_object_iterator'
    try:
        workbook = HandleXLSX(file_name, sheet_name)
        rows = workbook.generate_row_object_iterator(
            check_file, sheet_name, start_point, end_point, **kwargs)
    except Exception as e:
        # Full traceback goes to the log; the raised error carries only the repr.
        nlogger.error(f'{fn} error: {traceback.format_exc()}')
        raise GetRowIterError(f'{fn} error: {e!r}')
    return workbook, rows
def handle_data_thread(row_object_iterator, **kwargs):
    """
    Run ``handle_data_task`` for every row object on a thread pool.

    :param row_object_iterator: iterable of row objects; it is fully consumed
        while the tasks are being submitted
    :param kwargs: keyword arguments forwarded to every ``handle_data_task`` call
    :return: list of the truthy task results, in completion order
    :raises ThreadTaskError: when the tasks do not all finish within
        ``TASK_WAITING_TIME`` or when any other error occurs
    """
    # Before Python 3.11 concurrent.futures.TimeoutError is NOT the builtin
    # TimeoutError, so it is caught explicitly next to whatever ``TimeoutError``
    # resolves to in this module.
    from concurrent.futures import TimeoutError as FuturesTimeoutError

    executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)
    try:
        all_task = [
            executor.submit(handle_data_task, row_object, **kwargs)
            for row_object in row_object_iterator
        ]
        data_result = []
        # as_completed raises a timeout error if the whole result iterator
        # could not be produced before the given timeout.
        for future in as_completed(all_task, timeout=TASK_WAITING_TIME):
            data = future.result()
            if data:
                data_result.append(data)
        nlogger.info(f'Handle data completed, {len(data_result)} rows')
        return data_result
    except (TimeoutError, FuturesTimeoutError) as e:
        nlogger.error("{fn} TimeoutError: {e}".format(fn='handle_data_thread', e=repr(e)))
        raise ThreadTaskError('{fn} TimeoutError: {e}'.format(
            fn='handle_data_thread', e=repr(e)))
    except Exception as e:
        nlogger.error("{fn} error: {e}".format(fn='handle_data_thread', e=traceback.format_exc()))
        flogger.error("{fn} error: {e}".format(fn='handle_data_thread', e=repr(e)))
        raise ThreadTaskError('{fn} error: {e}'.format(fn='handle_data_thread', e=repr(e)))
    finally:
        # Wait for the submitted futures to finish, then release the worker
        # threads on every exit path (success, timeout and failure alike).
        executor.shutdown(wait=True)
def add_sample_object_property(samples_object, row_object, **kwargs):
    """
    Register the names of one listing row on the samples object.

    The row's full name ('单位名称'), its abbreviation ('简称') and the result of
    ``simplify_company_name(full_name)`` are stored in the per-type dictionaries
    (``<type>_name``, ``<type>_full_name``, ``<type>_abbr_name``, where
    ``<type>`` comes from ``get_name_type(row_object)``) and in the
    ``all_name`` / ``all_full_name`` / ``all_abbr_name`` dictionaries.

    :param samples_object: object exposing ``get_property`` / ``set_property``
        and the ``all_name``, ``all_full_name``, ``all_abbr_name`` mappings
    :param row_object: row whose ``column_value`` mapping holds the cell values
    :param kwargs: accepted for call compatibility; not used here
    :return: the same ``samples_object``
    :raises AssertionError: when the full name or the abbreviation is missing or blank
    """
    try:
        raw_full_name = row_object.column_value.get('单位名称')
        raw_abbr_name = row_object.column_value.get('简称')
        # str(None) is the truthy text 'None'; map a missing cell to '' so the
        # emptiness checks below actually reject it.
        full_name = '' if raw_full_name is None else str(raw_full_name).strip()
        abbr_name = '' if raw_abbr_name is None else str(raw_abbr_name).strip()
        # Raised explicitly so the validation survives ``python -O``.
        if not full_name:
            raise AssertionError("单位名称 can't be empty")
        if not abbr_name:
            raise AssertionError("简称 can't be empty")
        brief_name = simplify_company_name(full_name)

        # Property names on the samples object are prefixed with the row's name type.
        _name_type = get_name_type(row_object)
        _property_name = f'{_name_type}_name'
        _property_full_name = f'{_name_type}_full_name'
        _property_abbr_name = f'{_name_type}_abbr_name'
        _property_name_value = samples_object.get_property(_property_name)
        _property_full_name_value = samples_object.get_property(_property_full_name)
        _property_abbr_name_value = samples_object.get_property(_property_abbr_name)

        # The full-name entry is the terminal record holding the source data.
        _property_full_name_value[full_name] = {
            'termination': True,
            'full_name': raw_full_name,
            'company_type': row_object.column_value.get('单位类别'),
        }
        samples_object.set_property(_property_full_name, _property_full_name_value)

        # Every other entry is a reference pointing back at the full name.
        _property_name_value[full_name] = {_property_full_name: full_name}
        samples_object.set_property(_property_name, _property_name_value)
        samples_object.all_full_name[full_name] = {_property_full_name: full_name}
        samples_object.all_name[full_name] = {_property_full_name: full_name}

        # The abbreviation is always registered; the simplified name only when
        # it differs from the abbreviation.
        aliases = [abbr_name]
        if brief_name != abbr_name:
            aliases.append(brief_name)
        for alias in aliases:
            _property_abbr_name_value[alias] = {_property_full_name: full_name}
            samples_object.set_property(_property_abbr_name, _property_abbr_name_value)
            _property_name_value[alias] = {_property_full_name: full_name}
            samples_object.set_property(_property_name, _property_name_value)
            samples_object.all_abbr_name[alias] = {_property_full_name: full_name}
            samples_object.all_name[alias] = {_property_full_name: full_name}
        return samples_object
    except Exception as e:
        nlogger.error('{fn} Undefined error: {e}'.format(fn='add_sample_object_property',
                                                         e=traceback.format_exc()))
        print(f'Undefined error: {repr(e)}')
        raise
def _mark_row_failed(row_object, result_text):
    """Flag the row as failed, store ``result_text`` under 'result' and return a printable company name."""
    row_object.status = RowStatus.ERROR.value
    column_value = getattr(row_object, 'column_value', None)
    if isinstance(column_value, dict):
        column_value['result'] = result_text
        return str(column_value.get('公司', '未知单位')).strip()
    # No usable cell mapping on the row: create one so the result can still be reported.
    setattr(row_object, 'column_value', {'result': result_text})
    return '未知单位'


def handle_data_task(row_object, **kwargs):
    """
    Check and process one row; failures are recorded on the row instead of raised.

    :param row_object: row object to process
    :param kwargs: keyword arguments forwarded to ``check_row_object`` and ``handle_data``
    :return: the value returned by ``handle_data`` on success; otherwise the
        original ``row_object`` with its status set to ``RowStatus.ERROR.value``
        and the failure text stored under ``column_value['result']``
    """
    try:
        check_row_object(row_object, **kwargs)
        return handle_data(row_object, **kwargs)
    except AssertionError as e:
        nlogger.error("{fn} Params error: {e}".format(
            fn='handle_data_task', e=traceback.format_exc()))
        _company_name = _mark_row_failed(
            row_object, f'Params AssertionError: {str(e)}')
        flogger.error("handle_data_task failed:{n},AssertionError:{e}".format(
            n=_company_name, e=repr(e)))
        return row_object
    except HandleDataError as e:
        # handle_data wraps any failure (including a row without a usable
        # column_value) in HandleDataError, so the row is not assumed well-formed.
        _company_name = _mark_row_failed(row_object, f'HandleDataError: {str(e)}')
        flogger.error("handle_data_task failed:{n},HandleDataError:{e}".format(
            n=_company_name, e=repr(e)))
        return row_object
    except Exception as e:
        # Capture the traceback before anything else runs in this handler.
        _traceback_text = traceback.format_exc()
        _company_name = _mark_row_failed(
            row_object, f'handle_data_task undefined error: {str(e)}')
        # The handler itself must never raise: an exception escaping here would
        # surface from future.result() and abort the whole batch.
        nlogger.error(
            "{fn} position:{p}, company name:{n}, undefined error: {e}".format(
                fn='handle_data_task',
                p=getattr(row_object, 'position', None),
                n=_company_name,
                e=_traceback_text))
        flogger.error(
            "handle_data_task undefined failed:{n},HandleDataError:{e}".format(
                n=_company_name, e=repr(e)))
        return row_object
def get_file_path(file_name, root_path, file_relative_path=None, **kwargs):
    """
    Build the absolute path of a file, creating its directory when missing.

    :param file_name: name of the file; non-empty string without '/'
    :param root_path: root directory; non-empty string starting with '/'
    :param file_relative_path: optional directory below ``root_path``; must not
        start with '/'. ``None`` and '' both mean "directly in the root"
    :param kwargs: accepted for call compatibility; not used here
    :return: ``root_path`` joined with the relative directory and the file name
    :raises UndefinedError: on any failure, including a rejected argument
    """
    try:
        assert isinstance(file_name, str) and file_name.strip(), \
            "Parameter file_name must be string and not be empty"
        assert '/' not in file_name, "Parameter file_name cannot contain a path"
        clean_name = file_name.strip()

        assert isinstance(root_path, str) and root_path.strip(), \
            "Parameter root_path must be string and not be empty"
        assert root_path.startswith('/'), "Parameter root_path must be absolute path"
        clean_root = root_path.strip()

        relative_dir = ''
        if file_relative_path is not None and file_relative_path != '':
            assert isinstance(file_relative_path, str), \
                "Parameter file_relative_path must be string"
            assert not file_relative_path.startswith('/'), \
                "Parameter file_relative_path must be relative path"
            relative_dir = file_relative_path.strip()

        target_dir = os.path.join(clean_root, relative_dir)
        try:
            if not os.path.exists(target_dir):
                # exist_ok=True: an already existing directory is not an error.
                os.makedirs(target_dir, exist_ok=True)
        except FileExistsError as e:
            print(f'FileExistsError: {e!r}')
        return os.path.join(target_dir, clean_name)
    except Exception as e:
        nlogger.error(f'get_file_path error: {traceback.format_exc()}')
        raise UndefinedError(f'get_file_path error: {e!r}')
def get_file_md5(file, file_iterator, chunk_size=4096 * 1024, **kwargs):
    """
    Compute the upper-case hexadecimal MD5 digest of a file.

    :param file: passed as first argument to ``file_iterator``
    :param file_iterator: callable invoked as ``file_iterator(file, chunk_size, **kwargs)``;
        every item it yields is fed to the hash
    :param chunk_size: passed as second argument to ``file_iterator`` (default 4 MB)
    :param kwargs: extra keyword arguments forwarded to ``file_iterator``
    :return: MD5 hex digest in upper case
    :raises FunctionError: when reading or hashing fails
    """
    try:
        digest = hashlib.md5()
        chunks = file_iterator(file, chunk_size, **kwargs)
        for chunk in chunks:
            digest.update(chunk)
        return digest.hexdigest().upper()
    except Exception as e:
        nlogger.error(f'get_file_md5 error: {traceback.format_exc()}')
        raise FunctionError(f'get_file_md5 error: {e!r}')
def file_iterator(file, encoding="utf-8"):
    """
    Lazily yield the lines of a text file.

    :param file: path handed to ``open``
    :param encoding: text encoding; ``None`` falls back to "utf-8"
    :return: generator of lines, line endings included
    :raises FunctionError: when opening or reading the file fails (raised once
        the generator is iterated, not at call time)
    """
    try:
        if encoding is None:
            encoding = "utf-8"
        with open(file, "r", encoding=encoding) as f:
            for line in f:
                yield line
    except Exception as e:
        # The log label previously named 'stream_iterator' (copy-paste slip).
        nlogger.error('{fn} error: {e}'.format(fn='file_iterator', e=traceback.format_exc()))
        raise FunctionError('{fn} error: {e}'.format(fn='file_iterator', e=repr(e))) from e
def stream_iterator(file, chunk_size, **kwargs):
    """
    Lazily yield the content of a file as binary pieces.

    :param file: path handed to ``open`` in binary mode
    :param chunk_size: number of bytes per piece; anything that is not an
        ``int`` is replaced by 4 MB
    :param kwargs: accepted for call compatibility; not used here
    :return: generator of non-empty ``bytes`` pieces
    :raises FunctionError: when opening or reading the file fails (raised once
        the generator is iterated, not at call time)
    """
    try:
        piece_size = chunk_size if isinstance(chunk_size, int) else 4096 * 1024
        with open(file, "rb") as f:
            # An empty read marks the end of the file.
            for file_part in iter(lambda: f.read(piece_size), b''):
                yield file_part
    except Exception as e:
        nlogger.error(f'stream_iterator error: {traceback.format_exc()}')
        raise FunctionError(f'stream_iterator error: {e!r}')
def handle_data(row_object, **kwargs):
    """
    Resolve the company of one row and attach the company information to it.

    The raw '公司' cell is passed through ``extract_company_name``; the result
    is handed, as ``company_name``, together with the remaining keyword
    arguments to ``set_row_object_company_info``.

    :param row_object: row whose ``column_value`` mapping holds the '公司' cell
    :param kwargs: must contain a truthy ``samples_object``; forwarded to
        ``set_row_object_company_info``
    :return: the value returned by ``set_row_object_company_info``
    :raises AssertionError: when the company cell is missing, the extracted
        name is empty, or ``samples_object`` is missing
    :raises HandleDataError: on any other failure
    """
    try:
        raw_company = row_object.column_value.get('公司')
        # str(None) would become the text 'None' and be looked up as if it
        # were a real company name, so a missing cell is rejected up front.
        if raw_company is None:
            raise AssertionError("company name is invalid")
        kwargs['company_name'] = extract_company_name(str(raw_company).strip())
        # Raised explicitly so the validation survives ``python -O``.
        if not kwargs.get('company_name'):
            raise AssertionError("company name is invalid")
        if not kwargs.get('samples_object'):
            raise AssertionError("company dict is invalid")
        return set_row_object_company_info(row_object, **kwargs)
    except AssertionError:
        # Validation failures are reported by the caller; pass them through unchanged.
        raise
    except Exception as e:
        nlogger.error("{fn} error: {e}".format(fn='handle_data', e=traceback.format_exc()))
        raise HandleDataError("{fn} error: {e}".format(fn='handle_data', e=repr(e)))
def exec_func(check_file,
              file_name=None,
              sheet_name=None,
              start_point=None,
              end_point=None,
              **kwargs):
    """
    Run the whole pipeline: load the samples, process the data rows, write the result.

    Errors are logged and printed; nothing is raised to the caller.

    :param check_file: forwarded to ``get_row_object_iterator`` for both workbooks
    :param file_name: name of the data workbook, validated by ``check_file_name``
    :param sheet_name: sheet of the data workbook
    :param start_point: forwarded to ``get_row_object_iterator`` for the data workbook
    :param end_point: forwarded to ``get_row_object_iterator`` for the data workbook
    :param kwargs: forwarded to every helper called here
    :return: None
    """
    try:
        # Step 1: build the samples object from the 'listing' sheet of SAMPLES_FILE.
        samples_file = check_file_name(SAMPLES_FILE, **kwargs)
        _, sample_rows = get_row_object_iterator(
            check_file, samples_file, 'listing', **kwargs)
        samples_object = get_samples_object(sample_rows, **kwargs)
        nlogger.info("get_samples_object has been completed")

        # Step 2: open the data workbook and iterate its rows.
        data_file = check_file_name(file_name, **kwargs)
        data_xls, data_rows = get_row_object_iterator(
            check_file, data_file, sheet_name, start_point, end_point, **kwargs)

        # Step 3: process the rows on the thread pool.
        nlogger.info("handle_data_thread start")
        processed_rows = handle_data_thread(
            row_object_iterator=data_rows,
            samples_object=samples_object,
            **kwargs)

        # Step 4: write the outcome into the data workbook.
        nlogger.info("write_result_to_xls start")
        write_result_to_xls(data_xls, processed_rows)
    except (GetRowIterError, HandleDataError, ThreadTaskError, WriteResultError) as e:
        nlogger.error(f'exec_func Custom error: {e!r}')
        print(f'Custom error: {e!r}')
    except AssertionError as e:
        nlogger.error(f'exec_func Assertion error: {traceback.format_exc()}')
        print(repr(e))
    except Exception as e:
        nlogger.error(f'exec_func error: {traceback.format_exc()}')
        print(f'Undefined error: {e!r}')
def download_large_file(download_url, absolute_path_file, file_name, chunk_size, retry, **kwargs):
    """
    Download a large file with resume support (HTTP Range requests).

    :param download_url: url to download from
    :param absolute_path_file: destination file including its absolute path;
        received data is appended to it
    :param file_name: file name, used in log messages only
    :param chunk_size: chunk size passed to ``iter_content``
    :param retry: maximum number of attempts
    :param kwargs: accepted for call compatibility; not used here
    :return: the name of the written file on success, otherwise ``None``
    """
    for _ in range(retry):
        # The resume offset must be measured on EVERY attempt: a failed attempt
        # may already have appended data, and reusing the first offset would
        # append the same bytes again.
        # NOTE(review): assumes check_temp_file_exists returns the current size
        # in bytes of the partial file - confirm against its definition.
        _temp_size = check_temp_file_exists(absolute_path_file)
        headers = {
            'Range': 'bytes=%d-' % _temp_size,
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, '
                          'like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
        }
        try:
            with closing(
                    download_file_requester.get_url(url=download_url,
                                                    stream=True,
                                                    headers=headers)) as res:
                with open(absolute_path_file, 'ab+') as f:
                    for chunk in res.iter_content(chunk_size=chunk_size):
                        if chunk:
                            f.write(chunk)
                    return f.name
        except RequestException as e:
            # Request-level failure: log it and try again.
            error_msg = f"Download failed::{file_name}, storage path is {absolute_path_file}," \
                        f"RequestException error: {repr(e)}"
            nlogger.error(error_msg)
            continue
        except Exception as e:
            # Built without %-formatting: a '%' in the file name or path would
            # otherwise raise inside this handler.
            error_prefix = f"Download failed:{file_name}, storage path is {absolute_path_file}, undefined error: "
            nlogger.error(error_prefix + traceback.format_exc())
            flogger.error(error_prefix + repr(e))
            return None

    error_msg = f"Download failed:{file_name}, storage path is {absolute_path_file}, {retry} retries failed"
    nlogger.error(error_msg)
    flogger.error(error_msg)
    return None
def download_small_file(download_url, absolute_path_file, file_name, retry, **kwargs):
    """
    Download a small file in a single write.

    :param download_url: url to download from
    :param absolute_path_file: destination file including its absolute path;
        it is overwritten on every attempt
    :param file_name: file name, used in log messages only
    :param retry: maximum number of attempts
    :param kwargs: accepted for call compatibility; not used here
    :return: the name of the written file on success, otherwise ``None``
    """
    headers = {
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, '
                      'like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
    }
    for _ in range(retry):
        try:
            with closing(
                    download_file_requester.get_url(url=download_url,
                                                    stream=True,
                                                    headers=headers)) as res:
                with open(absolute_path_file, mode='wb') as f:
                    f.write(res.content)
                    return f.name
        except RequestException as e:
            # Request-level failure: log it and try again.
            error_msg = f"Download failed::{file_name}, storage path is {absolute_path_file}," \
                        f"RequestException error: {repr(e)}"
            nlogger.error(error_msg)
            continue
        except Exception as e:
            # Built without %-formatting: a '%' in the file name or path would
            # otherwise raise inside this handler.
            error_prefix = f"Download failed:{file_name}, storage path is {absolute_path_file}, undefined error: "
            nlogger.error(error_prefix + traceback.format_exc())
            flogger.error(error_prefix + repr(e))
            return None

    error_msg = f"Download failed:{file_name}, storage path is {absolute_path_file}, {retry} retries failed"
    nlogger.error(error_msg)
    flogger.error(error_msg)
    return None
def write_result_to_xls(source_xls, data_result):
    """
    Write the processing outcome of every row into the workbook and save a copy.

    Six output columns are used: 'result', 'name', 'guess_name', 'full_name',
    'type' and 'similarity'. A header cell is written in row 1 for each column
    whose index lies beyond the original number of columns. The workbook is
    saved as ``result_<timestamp>.xlsx``.

    :param source_xls: workbook wrapper providing ``get_column_name_list``,
        ``write_sheet_rows_value``, ``sheet`` and ``save``
    :param data_result: iterable of processed row objects
    :return: None
    :raises WriteResultError: when anything fails while writing or saving
    """
    # (sheet header, key in row_object.column_value) for the optional cells,
    # in the order they are written.
    optional_cells = (
        ('name', 'company_name'),
        ('guess_name', 'guess_name'),
        ('full_name', 'company_full_name'),
        ('type', 'company_type'),
        ('similarity', 'similarity'),
    )
    try:
        header_names = source_xls.get_column_name_list()
        original_width = len(header_names)

        # Resolve the 1-based column index of every output column, one at a time.
        column_of = {}
        for header in ('result',) + tuple(h for h, _ in optional_cells):
            col = recursive_get_index(header_names, header) + 1
            column_of[header] = col
            if col > original_width:
                source_xls.write_sheet_rows_value(
                    sheet_name=source_xls.sheet.title,
                    values=(1, col, header))

        for row_object in data_result:
            row_no = row_object.position
            # The result cell is always written, 'unknown' when absent.
            source_xls.write_sheet_rows_value(
                sheet_name=row_object.sheet_name,
                values=(row_no, column_of['result'],
                        row_object.column_value.get('result', 'unknown')))
            # The remaining cells are written only when they hold a truthy value.
            for header, key in optional_cells:
                cell = row_object.column_value.get(key)
                if cell:
                    source_xls.write_sheet_rows_value(
                        sheet_name=row_object.sheet_name,
                        values=(row_no, column_of[header], cell))

        stamp = datetime.now().strftime('%Y%m%d-%H:%M:%S')
        _result_file_name = f"result_{stamp}.xlsx"
        source_xls.save(_result_file_name)
        nlogger.info(f'Write result completed, output file: {_result_file_name}')
    except Exception as e:
        nlogger.error(f"write_result_to_xls error: {traceback.format_exc()}")
        raise WriteResultError(f"write_result_to_xls error: {e!r}")
def download_video(download_url, absolute_path_file, file_name, **kwargs):
    """
    Download a video with the ``wget`` module, retrying transient failures.

    Timeouts, ``HTTPError`` and ``URLError`` are retried up to three attempts
    in total; any other error fails immediately. All failures are logged and
    reported through the return value, never raised.

    :param download_url: temporary download url
    :param absolute_path_file: absolute storage path of the video file
    :param file_name: file name of the video, used in log messages
    :param kwargs: accepted for call compatibility; not used here
    :return: the value returned by ``wget.download`` on success, otherwise ``None``
    """
    max_attempts = 3
    try:
        _start_time = time.time()
        nlogger.info('Download start:{f}.'.format(f=absolute_path_file))
        for attempt in range(1, max_attempts + 1):
            try:
                # wget.download relies on urllib's urlretrieve, which has no
                # timeout of its own; the socket default timeout prevents the
                # download from hanging forever.
                socket.setdefaulttimeout(SOCKET_TIMEOUT)
                download_file_name = wget.download(download_url,
                                                   out=absolute_path_file,
                                                   bar=None)
                _end_time = time.time()
                nlogger.info('Download success:{f}, it takes {t:.2f}s.'.format(
                    f=absolute_path_file, t=_end_time - _start_time))
                slogger.info(
                    'Download success:{f}, its storage path is {p}.'.format(
                        f=file_name, p=absolute_path_file))
                return download_file_name
            except socket.timeout:
                error_msg = f"Download timeout: {file_name},storage path is {absolute_path_file},url is{download_url}"
                randomized_pause = False
            except HTTPError as e:
                # HTTPError must be handled before its base class URLError.
                error_msg = "Download HTTPError:{0}, {1}, {2}, storage path is {3}".format(
                    file_name, download_url, e.code, absolute_path_file)
                randomized_pause = True
            except URLError as e:
                error_msg = "Download URLError:{0}, {1}, {2}, storage path is {3}".format(
                    file_name, download_url, e.reason, absolute_path_file)
                randomized_pause = True
            except Exception as e:
                # Anything else is not retried.
                nlogger.error(
                    'Download failed:{f}, storage path is {p},WGet error: {e}'.
                    format(f=file_name,
                           p=absolute_path_file,
                           e=traceback.format_exc()))
                raise WGetError(
                    'Download failed:{f},storage path is {p},WGet error: {e}'.
                    format(f=file_name, p=absolute_path_file, e=repr(e)))

            # Only retryable failures reach this point.
            nlogger.error(error_msg)
            if attempt == max_attempts:
                raise WGetError(error_msg)
            time.sleep(random.randint(1, 2) if randomized_pause else 1)
    except WGetError as e:
        flogger.error(repr(e))
        return None
    except Exception as e:
        nlogger.error('{fn} download {f} error: {e}'.format(
            fn='download_video', f=file_name, e=traceback.format_exc()))
        flogger.error("Download failed:{f}, error: {e}".format(f=file_name,
                                                                e=repr(e)))
        return None