def load(html, encode, xpaths):
    # Parse the page once, then resolve every configured xpath over it.
    # NOTE(review): the extracted value is computed but never stored,
    # returned, or written anywhere -- this reads like an abandoned
    # fragment of the fuller load() variants below; confirm before reuse.
    page = HtmlParser(html, encode)
    page.parse()
    for field in xpaths:
        expr = xpaths.get(field)
        matches = page.get_element_by_xpath(expr, encode)
        # Raises IndexError when the xpath matches nothing (same as before).
        value = matches[0][2].encode('utf-8')
def load(id, tm, url, html, encode, xpaths):
    """Extract job-posting fields from an HTML page and dump them as JSON.

    Parses `html` (decoded with `encode`), adds "pub_tm"/"url" from `tm`
    and `url`, evaluates each xpath in the `xpaths` mapping, and writes the
    resulting object to ./data/<id>.dat.  Fields whose xpath matches
    nothing are skipped after printing an error line.
    """
    import json  # function-scope import: keeps the file-level import block untouched

    parser = HtmlParser(html, encode)
    parser.parse()
    # Collect fields in a dict and let json.dumps do quoting/escaping.
    # The previous hand-rolled string concatenation emitted invalid JSON
    # whenever a value contained a double quote or backslash, and the
    # dead `db_sql` / `jd` locals have been dropped (neither was used).
    record = {"pub_tm": tm, "url": url}
    for key in xpaths:
        elements = parser.get_element_by_xpath(xpaths.get(key), encode)
        if len(elements) == 0:
            # xpath matched nothing on this page; report and skip the field
            print("[ERR] " + key)
            continue
        record[key] = elements[0][2].encode('utf-8')
    # `with` guarantees the file is closed even if write() raises;
    # the old open()/close() pair leaked the handle on error.
    with open("./data/" + id + ".dat", 'w') as fp:
        fp.write(json.dumps(record))
def load(id, html, encode, xpaths):
    """Extract fields from an HTML page and dump them as JSON.

    Parses `html` (decoded with `encode`), evaluates each xpath in the
    `xpaths` mapping, and writes {<field>: <text>, ...} to
    ./data/<id>.dat.  Fields whose xpath matches nothing are skipped
    after printing an error line.
    """
    import json  # function-scope import: keeps the file-level import block untouched

    parser = HtmlParser(html, encode)
    parser.parse()
    # Build a dict and serialize with json.dumps: the previous manual
    # concatenation produced invalid JSON for values containing a double
    # quote or backslash.  The unused `jd` protobuf local was removed.
    record = {}
    for key in xpaths:
        elements = parser.get_element_by_xpath(xpaths.get(key), encode)
        if len(elements) == 0:
            # xpath matched nothing on this page; report and skip the field
            print("[ERR] " + key)
            continue
        record[key] = elements[0][2].encode('utf-8')
    # `with` closes the file even if write() raises (old code leaked fp).
    with open("./data/" + id + ".dat", 'w') as fp:
        fp.write(json.dumps(record))
def load(id, html, encode, xpaths):
    """Extract fields from an HTML page and write them to ./data/<id>.dat as JSON.

    NOTE(review): this definition duplicates the previous `load` variant
    byte-for-byte except for whitespace; only one of them can be live at
    import time (the last definition wins) -- consider deleting one.

    Parses `html` (decoded with `encode`), evaluates each xpath in the
    `xpaths` mapping, and serializes the extracted fields.  Fields whose
    xpath matches nothing are skipped after printing an error line.
    """
    import json  # function-scope import: keeps the file-level import block untouched

    parser = HtmlParser(html, encode)
    parser.parse()
    # json.dumps handles quoting/escaping; the hand-built string previously
    # emitted broken JSON whenever a value contained '"' or '\'.  The
    # unused `jd = page_pb2.JobDescription()` local was removed.
    record = {}
    for key in xpaths:
        elements = parser.get_element_by_xpath(xpaths.get(key), encode)
        if len(elements) == 0:
            # xpath matched nothing on this page; report and skip the field
            print("[ERR] " + key)
            continue
        record[key] = elements[0][2].encode('utf-8')
    # `with` guarantees the handle is closed even on a write error.
    with open("./data/" + id + ".dat", 'w') as fp:
        fp.write(json.dumps(record))