def detail_new(selector):
    """Extract the fields of a "new" layout page from a parsed document.

    Each field is obtained by evaluating one XPath from ``settings`` on
    ``selector`` and passing the result through ``xpath_handler``.

    :param selector: object exposing ``xpath()`` (the parsed page).
    :returns: tuple ``(address, intro, comment, contact, grade)``.
    """
    field_paths = (
        settings.ADDRESS_MFW_NEW,
        settings.INTRO_MFW_NEW,
        settings.COMMENT_MFW_NEW,
        settings.CONTACT_MFW_NEW,
        settings.GRADE_MFW_NEW,
    )
    # Order of field_paths defines the order of the returned tuple.
    return tuple(xpath_handler(selector.xpath(path)) for path in field_paths)
def xc_scene(url): rsp = requests.get(url=url, headers=HEADER).text selector = etree.HTML(rsp) nodes = selector.xpath(settings.NODES_XC) if nodes: for node in nodes: name = xpath_handler(node.xpath(settings.NAME_XC)) address = xpath_handler(node.xpath(settings.ADDRESS_XC)) grade = xpath_handler(node.xpath(settings.SCORE_XC)) comment = re.search('\d+', xpath_handler(node.xpath(settings.COMMENT_XC))) comment = comment.group() if comment else 0 url = detail_url = settings.DOMAIN_XC + xpath_handler( node.xpath(settings.DETAIL_XC)) intro, website, contact = get_detail(detail_url) params = dict( name=name.encode('utf-8'), address=address.encode('utf-8'), grade=float(grade.encode('utf-8')) if grade else 0, comment=comment.encode('utf-8'), url=url.encode('utf-8'), intro=intro.encode('utf-8'), website=website.encode('utf-8'), contact=contact.encode('utf-8'), ) print name MY_DB.insert(settings.QUERY_XC, params) else: print "***warning url:%s" % url
def get_detail(url):
    """Fetch a detail page and return its introduction, website and phone.

    For the phone, only the XPath results in which a digit can be found are
    handed to ``xpath_handler``.

    :param url: address of the detail page.
    :returns: tuple ``(introduce, website, phone)``.
    """
    def contains_digit(text):
        return re.search('\d', text)

    page = requests.get(url=url, headers=HEADER)
    tree = etree.HTML(page.text)

    introduce = xpath_handler(tree.xpath(settings.INTRODUCE_XC))
    website = xpath_handler(tree.xpath(settings.WEBSITE_XC))

    phone_candidates = filter(contains_digit, tree.xpath(settings.PHONE_XC))
    phone = xpath_handler(phone_candidates)

    return introduce, website, phone
def get_page(params):
    """Return the page value found in the comment endpoint's response.

    Sends a POST to ``settings.COMMENT_URL`` with ``params``, evaluates
    ``settings.PAGE_PATH`` on the parsed HTML and converts whatever
    ``xpath_handler`` yields (called with ``default_val=1``) to ``int``.

    :param params: mapping passed to ``requests.post`` as ``params``.
    :returns: the page value as an integer.
    """
    response = requests.post(
        url=settings.COMMENT_URL,
        headers=settings.HEADER,
        params=params,
    )
    tree = etree.HTML(response.text)
    page_value = xpath_handler(tree.xpath(settings.PAGE_PATH), default_val=1)
    return int(page_value)
def spider(params, level, page_num):
    """Fetch one page of comments and insert a record for every node found.

    NOTE: ``params`` is modified in place; its ``'pagenow'`` key is set to
    ``page_num`` before the request is sent.

    :param params: mapping passed to ``requests.post`` as ``params``.
    :param level: value stored unchanged in each record's ``level`` field.
    :param page_num: page to request.
    :returns: None.
    """
    params['pagenow'] = page_num
    response = requests.post(
        url=settings.COMMENT_URL,
        headers=settings.HEADER,
        params=params,
    )
    tree = etree.HTML(response.text)

    for entry in tree.xpath(settings.ROOT_PATH):
        record = {
            'nick': xpath_handler(entry.xpath(settings.NICK_PATH)),
            'date': xpath_handler(entry.xpath(settings.DATE_PATH)),
            'comment': xpath_handler(entry.xpath(settings.COMMENT_PATH)),
            'level': level,
        }
        MY_DB.insert(settings.SQL_QUERY, record)
def detail_old(selector):
    """Extract the fields of an "old" layout page from a parsed document.

    The comment field is reduced to its first run of digits; when the text
    contains none, the integer 0 is returned in its place.

    :param selector: object exposing ``xpath()`` (the parsed page).
    :returns: tuple ``(address, intro, comment, open, time, contact,
        website)``.
    """
    def field(path):
        return xpath_handler(selector.xpath(path))

    address = field(settings.ADDRESS_MFW)
    intro = field(settings.INTRO_MFW)

    digits = re.search('\d+', field(settings.COMMENT_MFW))
    if digits:
        comment = digits.group()
    else:
        comment = 0

    # Local names avoid shadowing the ``open`` builtin and ``time`` module.
    open_hours = field(settings.OPEN_MFW)
    visit_time = field(settings.TIME_MFW)
    contact = field(settings.CONTACT_MFW)
    website = field(settings.WEBSITE_MFW)

    return address, intro, comment, open_hours, visit_time, contact, website
def get_detail(url): rsp = requests.get(url=url, headers=HEADER).text selector = etree.HTML(rsp) name = xpath_handler(selector.xpath(settings.NAME_MFW)) if name: print 111, name address, intro, comment, open, time, contact, website = list(detail_old(selector)) print address, intro, comment, open, time, contact, website params = dict( name=name, address=address, intro=intro, comment=comment, open=open, time=time, contact=contact, website=website, url=url ) MY_DB.insert(settings.QUERY_MFW, params) else: name = xpath_handler(selector.xpath(settings.NAME_MFW_NEW)) if name: print 222, name address, intro, comment, contact, grade = list(detail_new(selector)) print address, intro, comment, contact, grade params = dict( name=name, address=address, intro=intro, comment=comment, contact=contact, grade=grade, url=url ) MY_DB.insert(settings.QUERY_MFW_NEW, params) else: print 333
def get_detail(url): rsp = requests.get(url=url, headers=HEADER, proxies=get_proxy()).text print 'get_detail_end...' selector = etree.HTML(rsp) name = xpath_handler(selector.xpath(settings.NAME_QNE)) address_phone = selector.xpath(settings.ADDRESS_PHONE_QNE) address = address_phone[0].strip() if len(address_phone) else '' contact = address_phone[1].strip() if len(address_phone) > 1 else '' coord = xpath_handler(selector.xpath(settings.COORD_QNE)) if coord: coord = coord.split(',') lon = coord[0] lat = coord[1] else: lon = lat = None grade = xpath_handler(selector.xpath(settings.GRADE_QNE)) or None comment = xpath_handler(selector.xpath(settings.COMMENT_QNE), 0) if comment: comment = re.search('\d+', comment) comment = comment.group() if comment else 0 open_time = xpath_handler(selector.xpath(settings.OPEN_QNE)) time_advise = xpath_handler(selector.xpath(settings.TIME_QNE)) time_advise = time_advise and time_advise.split(u':')[1] website = xpath_handler(selector.xpath(settings.WEBSITE_QNE)) intro = xpath_handler(selector.xpath(settings.INTRO_QNE)) if not name: return print name print address print contact print lon print lat try: grade = float(grade) except: grade = 0 print grade print comment print open_time print time_advise print website print intro params = dict( name=name, address=address, grade=grade, comment=comment, url=url, intro=intro, website=website, contact=contact, lon=lon, lat=lat, open=open_time, time=time_advise ) MY_DB.insert(settings.QUERY_QNE, params) # MY_DB.insert('delete from qne_url where url=%(url)s', dict(url=url)) if name: REDIS_CLIENT.set(url, 0)