def _get_inc_intro(doc):
    """Return the tag-stripped markup of the first company-introduction div.

    ``doc.xpath`` is queried for ``div`` elements whose class is
    ``tCompany_text_gsjs``. Only the first match is used: it is serialised
    with ``etree.tounicode`` and passed through ``HtmlFind.remove_tag``.

    Returns an empty string when the query matches nothing.
    """
    matches = doc.xpath("//div[@class='tCompany_text_gsjs']")
    if not matches:
        return ''
    # Any further matches are ignored; only the first one is returned.
    markup = etree.tounicode(matches[0], pretty_print=True)
    return HtmlFind.remove_tag(markup)
def parse(self):
    """Populate ``self.result`` from the job page held in ``self._doc``.

    Three page sections are read in order:

    * the ``div.cn`` header: position title, work location, company name
      and URL, salary, and (when present) the ``|``-separated company line
      giving type, scale and industry;
    * the ``div.jtag.inbox`` block: the first two ``sp4`` spans become
      ``jobWorkAge`` and ``jobPersonNumber``; the ``t2`` spans are
      concatenated into ``jobWelfare``;
    * the ``div.bmsg.job_msg.inbox`` block: the job description, from
      which the category text is also extracted.

    The optional ``div.tmsg.inbox`` block supplies ``incIntro``. Finally
    ``self.myprint()`` is called.

    Raises:
        Exception: when the header, the mid block or the description block
            cannot be found. The individual header fields are indexed
            with ``[0]`` without a presence check, and the company line is
            unpacked into exactly three parts.
    """
    # --- header section ---------------------------------------------------
    header_nodes = self._doc.xpath("//div[@class='cn']")
    if not header_nodes:
        raise Exception("find top_info exception")
    header = header_nodes[0]

    title = header.xpath("h1/@title")[0]
    # Drop any parenthesised remark from the title.
    self.result.jobPosition = self.replace_pattern('\(.*?\)', '', title)
    self.result.jobWorkLoc = header.xpath("span[@class='lname']")[0].text_content()
    self.result.incName = header.xpath("p[@class='cname']/a/@title")[0]
    self.result.incUrl = header.xpath("p[@class='cname']/a/@href")[0]
    self.result.jobSalary = header.xpath("strong")[0].text_content()

    company_line = header.xpath("p[@class='msg ltype']")
    if company_line:
        # Exactly three '|'-separated parts are expected here.
        kind, scale, industry = company_line[0].text_content().split('|')
        self.result.incType = HtmlFind.remove_tag(kind, 1)
        self.result.incScale = HtmlFind.remove_tag(scale, 1)
        self.result.incIndustry = HtmlFind.remove_tag(industry, 1)

    # --- mid section ------------------------------------------------------
    mid_nodes = self._doc.xpath("//div[@class='jtag inbox']")
    if not mid_nodes:
        raise Exception("find mid_info exception")
    mid = mid_nodes[0]

    # Only the first two spans are consumed; zip stops at the shorter input.
    summary_spans = mid.xpath("div[@class='t1']/span[@class='sp4']")
    for field, span in zip(['jobWorkAge', 'jobPersonNumber'], summary_spans):
        setattr(self.result, field, span.text_content())

    # Each welfare tag is followed by a single space, including the last.
    self.result.jobWelfare = ''.join(
        span.text_content() + ' ' for span in mid.xpath("p[@class='t2']/span")
    )

    # --- description section ----------------------------------------------
    desc_nodes = self._doc.xpath("//div[@class='bmsg job_msg inbox']")
    if not desc_nodes:
        raise Exception("get jobdesc exception")
    description = HtmlFind.remove_tag(
        html.tostring(desc_nodes[0], encoding='utf-8'), 1)
    # Keep only what precedes the keyword marker, if the marker is present.
    description = description.partition('关键字:')[0]
    self.result.jobDescription = description

    cate_start = description.find('职能类别:')
    if cate_start >= 0:
        # The category is the tail of the description minus its label.
        self.result.jobCate = self.replace_pattern(
            '职能类别:', '', description[cate_start:])

    # --- optional company introduction ------------------------------------
    intro_nodes = self._doc.xpath('//div[@class="tmsg inbox"]')
    if intro_nodes:
        intro_markup = html.tostring(intro_nodes[0], encoding='utf-8')
        self.result.incIntro = HtmlFind.remove_tag(intro_markup, 1)

    self.myprint()
def _set_job_others(ret, doc):
    """Copy job attributes from the page's definition lists onto ``ret``.

    Every ``dl.lineDl`` under ``div.tCompany_basic_job`` is walked as
    ``dt``/``dd`` pairs (paired by index). The stripped ``dt`` text selects
    which attribute of ``ret`` receives the value:

    * plain fields take the stripped text of the ``dd``;
    * welfare, category and tag fields take the comma-joined stripped text
      of the ``dd``'s ``span`` or ``a`` children.

    Afterwards ``ret.jobDescription`` is set from the ``ul`` elements under
    ``div.tCompany_text`` (serialised, concatenated, then tag-stripped), and
    ``ret.jobPosition`` from the first ``li.tCompany_job_name`` (empty string
    when there is none).
    """

    def _joined_children(node, child_path):
        # Comma-join the stripped text of each child selected by child_path.
        return ",".join(
            [child.text_content().strip() for child in node.xpath(child_path)]
        )

    # Label text -> attribute that receives the dd's own stripped text.
    text_fields = {
        "发布日期:": "pubDate",
        "工作地点:": "jobWorkLoc",
        "招聘人数:": "jobPersonNumber",
        "工作年限:": "jobWorkAge",
        "学历要求:": "jobDiploma",
        "薪资范围:": "jobSalary",
    }

    detail_lists = doc.xpath(
        '//div[@class="tCompany_basic_job"]/dl[@class="lineDl"]')
    for detail_list in detail_lists:
        labels = detail_list.xpath("dt")
        cells = detail_list.xpath("dd")
        for pos, label in enumerate(labels):
            key = label.text_content().strip()
            value = cells[pos].text_content().strip()
            # Labels are compared as UTF-8 byte strings.
            if isinstance(key, unicode):
                key = key.encode("utf8")

            if key in text_fields:
                setattr(ret, text_fields[key], value)
            elif key == "薪酬福利:":
                ret.jobWelfare = _joined_children(cells[pos], 'span')
            elif key == "职能类别:":
                ret.jobCate = _joined_children(cells[pos], 'a')
            elif key == "职位标签:":
                ret.jobTags = _joined_children(cells[pos], 'a')

    # Job description: concatenate every matching <ul>, then strip tags once.
    description_markup = "".join(
        [etree.tounicode(ul, pretty_print=True)
         for ul in doc.xpath("//div[@class='tCompany_text']/ul")]
    )
    ret.jobDescription = HtmlFind.remove_tag(description_markup)

    # Job position title.
    title_nodes = doc.xpath("//li[@class='tCompany_job_name']")
    ret.jobPosition = title_nodes[0].text_content().strip() if title_nodes else ''