def Querst_request(): """通过GET网页获取网页的URL,并且更新库内的url值 """ session = returns_session('Sqlextend')() to = ot_baidu_search_info data = session.query(to).filter('id > 694').all() if not data: return for item in data: new = ot_baidu_search_info() new.id = item.id while True: try: r_url = get(item.url) break except: time.sleep(1) print 'retry sleep 1' if r_url.ok: new.url = r_url.request.url print 'Update old_url = %s, New_url = %s' % (item.url, new.url) session.merge(new) try: session.commit() except: continue else: print item.id session.close()
def Querst_request(self): """搜索实例 """ point = insert_database('Sqlextend', tablename=ot_baidu_search_info) for key in self.keys: # self.firefox.get(self.url) for i in range(0, 20): self.set_pn(key, i) while True: try: self.firefox.get(self.url) break except: # self.firefox.quit() self.reset_firefox() continue data = self.firefox.page_source if data: xhtml = html.document_fromstring(data) content = zip(xhtml.xpath('//div[@id="content_left"]//div[@class="f13"]//div[@class="c-tools"]'), xhtml.xpath('//div[@id="content_left"]//div[@class="f13"]//span[@class="g"]')) for title, url in content: db = ot_baidu_search_info() try: db.title = json.loads( title.get('data-tools'))['title'].encode('utf8') except: try: db.title = title.get( 'data-tools').split(':')[1].split(',')[0].replace('"', '').encode('utf8') except IndexError: pass db.url = url.text_content().encode('utf8') db.key = key insert_database( 'Sqlextend', tablename=ot_baidu_search_info, editor=db) point.set_value(db) point.insert() """ for item in xhtml.xpath('//div[@id="content_left"]//div[@class="f13"]'): #print item.get('href'), item.text_content().encode('utf8') db = ot_baidu_search_info() import pdb pdb.set_trace() db.title = item.xpath('//h3//a')[0].title db.url = item.xpath('//span[@class="g"]')[0].text_content #db.url = item.get('href') db.key = key insert_database('Sqlextend', tablename = ot_baidu_search_info, editor = db) point.set_value(db) point.insert() """ time.sleep(2) self.firefox.close()