def analy_html(html_text='', url_parse=None):
    """Extract resume snippets (with neighbouring image URLs) from an HTML page.

    Args:
        html_text: Raw HTML of the page to analyse.
        url_parse: Parsed-URL context passed through to ``urlUtils.absoluteUrl``
            to resolve relative image links. Defaults to an empty dict.
            (Was a mutable default argument ``{}`` — fixed to ``None`` sentinel.)

    Returns:
        dict with a single key ``'resume'``: a list of dicts, each holding the
        matched text plus the absolute URLs of the nearest image before and
        after it in the document.
    """
    if url_parse is None:
        url_parse = {}
    arr_resume = []
    dict_res = {}
    soup = BeautifulSoup(html_text, "lxml")
    for element in nb.clean_empty_array(soup.get_text('\n').split()):
        # Skip fragments too short to be a meaningful resume line.
        if len(element) < 10:
            continue
        # Only keep elements recognised as a short resume or a job history.
        # (Previously, non-matching elements appended an empty dict.)
        if not (nb.is_short_resume(element) or nb.is_job_history(element)):
            continue
        # NOTE: element is guaranteed non-empty here (len >= 10), so the
        # original `if element != '' ... else: continue` branch was dead code.
        resume = {'text': element}
        resume['img_url_previous'] = urlUtils.absoluteUrl(
            url_parse, find_img_url(soup, element))
        resume['img_url_next'] = urlUtils.absoluteUrl(
            url_parse, find_img_url(soup, element, previous_direction=False))
        arr_resume.append(resume)
    # Extract/normalise the photo URLs (original comment: 摘取照片 url).
    arr_resume = clean_image_url(arr_resume)
    dict_res['resume'] = arr_resume
    return dict_res
def analy_html(html_text=''):
    """Collect job-appointment lines from an HTML page.

    Args:
        html_text: Raw HTML of the page to analyse.

    Returns:
        dict with a single key ``'appoint'``: the list of text fragments that
        ``nb.is_job_appoint`` recognised as appointment statements.
    """
    arr_appoint = []
    dict_res = {}
    soup = BeautifulSoup(html_text, "lxml")
    # Split the page text on whitespace and drop empty entries.
    for element in nb.clean_empty_array(soup.get_text('\n').split()):
        # Skip fragments too short to carry an appointment statement.
        if len(element) < 10:
            continue
        if nb.is_job_appoint(element):
            arr_appoint.append(element)
    # (Removed the unused local ``arr_resume`` and the redundant trailing
    # ``continue`` from the original.)
    dict_res['appoint'] = arr_appoint
    return dict_res
def get_resume_project_history(lines=()):
    """Filter *lines* down to those recognised as career project history.

    Args:
        lines: Iterable of text lines to classify. Default was a mutable
            ``[]`` — replaced with an immutable empty tuple (the argument is
            only iterated, so this is backward compatible).

    Returns:
        list of the lines for which ``NbUtils.is_career_project_history``
        is truthy, in their original order.
    """
    return [line for line in lines if NbUtils.is_career_project_history(line)]
def get_resume_kvtexts(lines=()):
    """Filter *lines* down to those recognised as resume key/value texts.

    Args:
        lines: Iterable of text lines to classify. Default was a mutable
            ``[]`` — replaced with an immutable empty tuple (the argument is
            only iterated, so this is backward compatible).

    Returns:
        list of the lines for which ``NbUtils.is_career_resume_kv``
        is truthy, in their original order.
    """
    return [line for line in lines if NbUtils.is_career_resume_kv(line)]
def analy_html(html_text=''):
    """Classify page text into punishment reasons, punishment results,
    and resume/job-history fragments.

    Args:
        html_text: Raw HTML of the page to analyse.

    Returns:
        dict with keys ``'reason'``, ``'result'`` and ``'resume'``, each a
        list of the matching text fragments in document order.
    """
    reasons, results, resumes = [], [], []
    soup = BeautifulSoup(html_text, "lxml")
    for text in nb.clean_empty_array(soup.get_text('\n').split()):
        # Fragments shorter than 10 characters carry no useful signal.
        if len(text) < 10:
            continue
        # Each fragment lands in at most one bucket; predicates are tried
        # in the same order as the original (reason, result, resume).
        if nb.is_punish_reason(text):
            reasons.append(text)
        elif nb.is_punish_result(text):
            results.append(text)
        elif nb.is_short_resume(text) or nb.is_job_history(text):
            resumes.append(text)
    return {'reason': reasons, 'result': results, 'resume': resumes}