def getUrl(url, page):
    url = url + str(page) + '.html'
    print(url)
    mysqlDao = MysqlDao()
    try:
        # Retry the listing page with an increasing back-off until the request succeeds.
        n = 1
        while True:
            try:
                headers = Headers.getHeaders()
                req = requests.get(url, headers=headers, timeout=10)
                break
            except Exception as e:
                print(Exception, ":", e)
                print('sleep')
                time.sleep(n * 10)
                n = n + 1
        if req.status_code == 200:
            html = req.content
            selector = etree.HTML(html)
            # Queue every detail-page link found in the listing box.
            url_contents = selector.xpath('//div[@class="box3"]/descendant::a/@href')
            for url_content in url_contents:
                sql = 'insert ignore into loldytt_url (`category_id`,`url`,`status`,created_at) VALUES (%s,%s,%s,%s)'
                created_at = time.strftime('%Y-%m-%d %H:%M:%S')
                # category_id is assumed to be defined at module level; it is not a parameter here.
                values = (category_id, url_content, 0, created_at)
                print(values)
                mysqlDao.executeValues(sql, values)
    finally:
        mysqlDao.close()
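# The crawlers in this repo lean on two helpers imported from elsewhere:
# Headers.getHeaders(), which returns a dict of request headers, and MysqlDao,
# which wraps a MySQL connection. The real modules are not shown here; the
# classes below are only a hypothetical sketch of the interface they appear to
# expose, inferred from how they are called (getHeaders(), execute(),
# executeValues(), close()). Host, credentials and database name are placeholders.
import random

import pymysql


class Headers(object):
    # Hypothetical stand-in: rotate a User-Agent so requests look less uniform.
    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6)',
    ]

    @staticmethod
    def getHeaders():
        return {'User-Agent': random.choice(Headers.user_agents)}


class MysqlDao(object):
    # Hypothetical stand-in: one connection per instance, committed after each statement.
    def __init__(self):
        self.conn = pymysql.connect(host='localhost', user='root', password='',
                                    db='spider', charset='utf8')

    def execute(self, sql):
        cursor = self.conn.cursor()
        cursor.execute(sql)
        self.conn.commit()
        return cursor.fetchall()

    def executeValues(self, sql, values):
        cursor = self.conn.cursor()
        cursor.execute(sql, values)
        self.conn.commit()

    def close(self):
        self.conn.close()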
def getContent(self, url, category_id):
    headers = Headers.getHeaders()
    sleep_time = 1
    # Retry the detail page, waiting 10 seconds longer after every failure.
    while True:
        try:
            req = requests.get(url, headers=headers, timeout=30)
            break
        except:
            print('sleep10')
            time.sleep(10 * sleep_time)
            sleep_time = sleep_time + 1
    if req.status_code == 200:
        html = req.content.decode('gb2312', 'ignore')
        selector = etree.HTML(html)
        root_path = selector.xpath('//div[contains(@id,"Zoom")]')
        names = selector.xpath('//div[contains(@class,"title_all")]/h1/font/text()')
        if len(root_path) > 0:
            contents = simplejson.dumps(root_path[0].xpath('descendant::text()'))
            imgs = simplejson.dumps(root_path[0].xpath('descendant::img/@src'))
            if len(names) > 0:
                name = names[0]
            else:
                name = ''
            print(name)
            created_at = time.strftime('%Y-%m-%d %H:%M:%S')
            sql_values = (category_id, name, contents, imgs, created_at, url)
            return sql_values
    else:
        pass
def bing(self, word):
    ret = []
    headers = Headers.getHeaders()
    url = 'http://global.bing.com/search?q=' + word
    req = requests.get(url, headers=headers, timeout=30)
    if req.status_code == 200:
        html = req.content
        selector = etree.HTML(html)
        words = selector.xpath('//li[@class="b_ans"]/ul/li/a/descendant::text()')
        ret.extend(words)
    return ret
def baidu(self, word):
    ret = []
    url = 'http://m.baidu.com/s?word=' + word
    headers = Headers.getHeaders()
    req = requests.get(url, headers=headers, timeout=30)
    if req.status_code == 200:
        html = req.content
        selector = etree.HTML(html)
        words = selector.xpath('//div[@class="rw-list"]/a/text()')
        ret.extend(words)
    print('baidu', ret)
    return ret
def getUrlLast(url):
    n = 1
    print(url)
    # Retry the request, sleeping 10 seconds longer after each failed attempt.
    while True:
        try:
            headers = Headers.getHeaders()
            req = requests.get(url, headers=headers, timeout=10)
            break
        except Exception as e:
            print(Exception, ":", e)
            print('sleep')
            time.sleep(n * 10)
            n = n + 1
def so(self, word):
    ret = []
    headers = Headers.getHeaders()
    headers['Referer'] = 'https://www.so.com/'
    url = 'http://www.so.com/s?q=' + word
    req = requests.get(url, headers=headers, timeout=30)
    if req.status_code == 200:
        html = req.content
        selector = etree.HTML(html)
        words = selector.xpath('//div[@id="rs"]/table/tr/th/a/text()')
        ret.extend(words)
    print('so', ret)
    return ret
def sogou(self, word):
    ret = []
    headers = Headers.getHeaders()
    headers['Referer'] = 'https://www.sogou.com/'
    url = 'http://m.sogou.com/web/searchList.jsp?pg=webSearchList&v=2&keyword=' + word
    req = requests.get(url, headers=headers, timeout=30)
    if req.status_code == 200:
        html = req.content
        selector = etree.HTML(html)
        words = selector.xpath('//div[@class="bc relate"]/a/text()')
        ret.extend(words)
    print('sogou', ret)
    return ret
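# bing(), baidu(), so() and sogou() each return a plain list of related-search
# strings, so a caller can merge and de-duplicate them. The sketch below is only
# illustrative; `spider` stands for whatever object the four methods above live
# on, and collect_suggestions is a hypothetical helper, not part of the repo.
def collect_suggestions(spider, word):
    words = []
    for source in (spider.bing, spider.baidu, spider.so, spider.sogou):
        try:
            words.extend(source(word))
        except Exception as e:
            # A single failing engine should not abort the whole collection.
            print(Exception, ":", e)
    # Drop duplicates while preserving the original order.
    seen = set()
    unique = []
    for w in words:
        if w not in seen:
            seen.add(w)
            unique.append(w)
    return unique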
def getContentUrl(url, category_id, mysqlDao):
    headers = Headers.getHeaders()
    req = requests.get(url, headers=headers, timeout=60)
    if req.status_code == 200:
        html = req.content
        selector = etree.HTML(html)
        content_urls = selector.xpath('//ul[@class="inqList pt18"]/li/a/@href')
        content_urls.reverse()
        for content_url in content_urls:
            content_url = Config.url_main + content_url
            created_at = time.strftime('%Y-%m-%d %H:%M:%S')
            sql = 'insert ignore into m1905_url (`category_id`,`url`,`status`,`created_at`) VALUES (%s,%s,%s,%s)'
            values = (category_id, content_url, 0, created_at)
            mysqlDao.executeValues(sql, values)
def getLastPage(url):
    last_page = 10
    headers = Headers.getHeaders()
    req = requests.get(url, headers=headers, timeout=30)
    if req.status_code == 200:
        html = req.content
        selector = etree.HTML(html)
        movie_count_text = selector.xpath('//div[@class="termsBox"]/div[1]/text()')
        if len(movie_count_text) > 0:
            # Keep only the digits from the count text, then page at 30 items per page.
            movie_count = int(filter(str.isdigit, movie_count_text[0].encode('utf8')))
            last_page = int(movie_count / 30)
            if movie_count % 30 > 0:
                last_page = last_page + 1
    return last_page
def getUrl(url, category_id):
    print(url)
    # Special case: rewrite page 26 of the Zuixinhanju chart to page 25.
    if url == 'http://www.loldytt.com/Zuixinhanju/chart/26.html':
        url = 'http://www.loldytt.com/Zuixinhanju/chart/25.html'
    mysqlDao = MysqlDao()
    n = 1
    while True:
        try:
            headers = Headers.getHeaders()
            req = requests.get(url, headers=headers, timeout=10)
            break
        except Exception as e:
            print(Exception, ":", e)
            print('sleep')
            time.sleep(n * 10)
            n = n + 1
def baiduzhidaosearch(keyword, page):
    ret = {
        'code': 1002,
        'msg': 'failure',
        'data': []
    }
    try:
        # Baidu Zhidao pages by offsets of 10 results.
        page = int(page) * 10
        keyword_u = keyword.encode('utf-8')
        print(chardet.detect(keyword_u))
        # url = 'http://zhidao.baidu.com/search?word=%s&ie=gbk&site=-1&sites=0&date=0&pn=%s' % (keyword.encode('utf-8 ').decode('gbk','ignore'), page)
        url = u'http://zhidao.baidu.com/search?ct=17&pn=%s&tn=ikaslist&rn=10&word=%s' % (page, keyword)
        # print(url)
        headers = Headers.getHeaders()
        proxies = Proxies.get_proxies()
        req = requests.get(url, headers=headers, timeout=60, proxies=proxies)
        if req.status_code == 200:
            ret['code'] = 1001
            ret['msg'] = 'success'
            id = []
            title = []
            req.encoding = 'gbk'
            html = req.text.encode(encoding="utf-8", errors="ignore").decode("utf-8", errors="ignore")
            selector = etree.HTML(html)
            # Question ids come from the result links, titles from the link markup.
            urls = selector.xpath('//div[@class="list"]/dl/dt[1]/a/@href')
            for u in urls:
                match_obj = re.search(r'question/(.*?).html', u, re.M | re.I)
                id.append(match_obj.group(1))
            titles = selector.xpath('//div[@class="list"]/dl/dt[1]/a')
            for t in titles:
                title.append(etree.tostring(t, encoding='utf8', method="html"))
            max_n = len(id)
            n = 0
            while n < max_n:
                # print(title[n])
                ret['data'].append({'cid': id[n],
                                    'title': re.search(r'"ti">(.*?)</a>', title[n], re.M | re.I).group(1)})
                n = n + 1
    except Exception as e:
        print(e)
    return simplejson.dumps(ret)
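# baiduzhidaosearch() returns a JSON string rather than a dict, so callers need
# to decode it before touching the code/msg/data fields. A small illustrative
# caller (print_zhidao_results is a hypothetical helper, not part of the repo):
def print_zhidao_results(keyword, page):
    result = simplejson.loads(baiduzhidaosearch(keyword, page))
    if result['code'] != 1001:
        print('search failed:', result['msg'])
        return
    for item in result['data']:
        # Each entry carries the question id and the highlighted title HTML.
        print(item['cid'], item['title'])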
def getPageCount(url, first_page):
    num = 0
    headers = Headers.getHeaders()
    try:
        url = url + str(first_page) + '.html'
        req = requests.get(url, headers=headers, timeout=30)
        if req.status_code == 200:
            html = req.content
            selector = etree.HTML(html)
            page_counts = selector.xpath('//div[@class="pagebox"]/span/text()')
            if len(page_counts) > 0:
                # The pager text holds the page count between the characters 部 and 页.
                page_count = page_counts[0]
                page_count_list = page_count.split(u'部')
                if len(page_count_list) >= 2:
                    aaa = page_count_list[1]
                    bbb = aaa.split(u'页')
                    if len(bbb) > 0:
                        page = bbb[0]
                        if page.isdigit():
                            num = int(page)
    except Exception as e:
        print(Exception, ":", e)
    return num
Keyword search
'''
import sys
import time

import requests
from headers import Headers
from lxml import etree
from mysqlpooldao import MysqlDao

reload(sys)
sys.setdefaultencoding('utf8')

while True:
    try:
        url = 'http://top.baidu.com/'
        headers = Headers.getHeaders()
        req = requests.get(url, headers=headers, timeout=30)
        if req.status_code == 200:
            html = req.content.decode('gb2312', 'ignore')
            selector = etree.HTML(html)
            # Each hot-search keyword sits in the title attribute of a link inside #box-cont.
            words = selector.xpath('//div[@id="box-cont"]/descendant::a/@title')
            for word in words:
                print(word)
                mysqlDao = MysqlDao()
                sql = 'insert ignore into allsearch_key_word (`word`,`parent_id`,`status`,`created_at`) VALUES (%s,%s,%s,%s)'
                created_at = time.strftime('%Y-%m-%d %H:%M:%S')
                values = (word, 0, 0, created_at)
                mysqlDao.executeValues(sql, values)
                mysqlDao.close()
        else:
def run(self):
    while True:
        print(self.name)
        mysqlDao = MysqlDao()
        # Claim one unprocessed URL (status=0) and mark it as in progress (status=2).
        sql = 'select * from loldytt_url WHERE `status`=0 limit 0,1'
        ret = mysqlDao.execute(sql)
        if len(ret) == 0:
            mysqlDao.close()
            # No work left: exit instead of sleeping and let crontab start the job again.
            print('game over')
            sys.exit()
        else:
            res = ret[0]
            id = res[0]
            category_id = res[1]
            url = res[2]
            sql = 'update loldytt_url set `status`=2 where `id`=' + str(id)
            mysqlDao.execute(sql)
            headers = Headers.getHeaders()
            # Try up to five times to fetch a page that actually contains a title.
            titles = []
            n = 0
            while n < 5:
                req = requests.get(url, headers=headers)
                req.encoding = "gbk"
                if req.status_code == 200:
                    html = req.text.encode(encoding="utf-8", errors="ignore").decode("utf-8", errors="ignore")
                    try:
                        selector = etree.HTML(html)
                    except:
                        print(333)
                    titles = selector.xpath('//div[contains(@class,"lm")]/h1/a/text()')
                    if len(titles) > 0:
                        break
                n = n + 1
            if len(titles) > 0:
                title = titles[0]
            else:
                continue
            casts = selector.xpath('//div[contains(@class,"zhuyan")]/ul[1]/li/text()')
            imgs = selector.xpath('//div[contains(@class,"haibao")]/a[1]/img/@src')
            cast = ''
            img = ''
            content = ''
            if len(casts) > 0:
                cast = casts[0].split(':')[1]
            if len(imgs) > 0:
                img = imgs[0]
            contents = selector.xpath('//div[@class="neirong"]/descendant::text()')
            if len(contents) > 0:
                content = simplejson.dumps(contents)
            created_at = time.strftime('%Y-%m-%d %H:%M:%S')
            # Collect the Thunder (xunlei), BT and magnet download links as key/value pairs.
            xunlei_download_keys = selector.xpath(
                '//*[contains(@id,"jishu")]/descendant::a[contains(@href,"thunder")]/text()')
            xunlei_download_values = selector.xpath(
                '//*[contains(@id,"jishu")]/descendant::a[contains(@href,"thunder")]/@href')
            bt_download_keys = selector.xpath(
                '//*[contains(@id,"bt")]/descendant::a[contains(@href,"thunder")]/text()')
            bt_download_values = selector.xpath(
                '//*[contains(@id,"bt")]/descendant::a[contains(@href,"thunder")]/@href')
            magnet_download_keys = selector.xpath('//a[contains(@href,"magnet")]/text()')
            magnet_download_values = selector.xpath('//a[contains(@href,"magnet")]/@href')
            xunlei_download = []
            bt_download = []
            magnet_download = []
            try:
                xn = 0
                for x in xunlei_download_keys:
                    xunlei_download.append({xunlei_download_keys[xn]: xunlei_download_values[xn]})
                    xn = xn + 1
                bn = 0
                for b in bt_download_keys:
                    bt_download.append({bt_download_keys[bn]: bt_download_values[bn]})
                    bn = bn + 1
                mn = 0
                for m in magnet_download_keys:
                    magnet_download.append({magnet_download_keys[mn]: magnet_download_values[mn]})
                    mn = mn + 1
            except Exception as e:
                print(Exception, ":", e)
            xunlei_download_json = simplejson.dumps(xunlei_download)
            bt_download_json = simplejson.dumps(bt_download)
            magnet_download_json = simplejson.dumps(magnet_download)
            sql_pattern = 'insert ignore INTO `loldytt_content`(`category_id`, `title`,`cast`,`img`,`xunlei_download`, `bt_download`, `magnet_download`, `content`, `url`,`created_at`) VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
            sql_values = (category_id, title, cast, img, xunlei_download_json, bt_download_json,
                          magnet_download_json, content, url, created_at)
            print(title)
            mysqlDao.executeValues(sql_pattern, sql_values)
            # Mark the URL as done (status=1).
            sql = 'update loldytt_url set `status`=1 where `id`=' + str(id)
            mysqlDao.execute(sql)
            mysqlDao.close()
def run(self):
    while True:
        print(self.name)
        mysqlDao = MysqlDao()
        # Claim one unprocessed URL (status=0) and mark it as in progress (status=2).
        sql = 'select * from bttiantang_url WHERE `status`=0 limit 0,1'
        ret = mysqlDao.execute(sql)
        res = []
        for r in ret:
            res = r
        print(res)
        if len(res) == 0:
            print('sleep')
            # sql = 'update yingshi_bttiantang_url set `status`=0 WHERE `status`=2'
            # database.mysqlExecute(sql)
            mysqlDao.close()
            # time.sleep(21600)
            # continue
            # No work left: exit instead of sleeping and let crontab start the job again.
            print('game over')
            sys.exit()
        else:
            id = res[0]
            url = res[1]
            sql = 'update bttiantang_url set `status`=2 where `id`=' + str(id)
            mysqlDao.execute(sql)
            headers = Headers.getHeaders()
            # Try up to five times to fetch a page that contains the detail list.
            contents = []
            n = 0
            while n < 5:
                req = requests.get(url, headers=headers)
                if req.status_code == 200:
                    html = req.content
                    selector = etree.HTML(html)
                    contents = selector.xpath('//ul[contains(@class,"moviedteail_list")]')
                    if len(contents) > 0:
                        break
                n = n + 1
            if len(contents) > 0:
                content = contents[0]
            else:
                continue
            names_chn = selector.xpath('//div[contains(@class,"moviedteail_tt")]/h1/text()')
            names_eng = selector.xpath('//div[contains(@class,"moviedteail_tt")]/span/text()')
            name_chn = ''
            name_eng = ''
            if len(names_chn) > 0:
                name_chn = names_chn[0]
            if len(names_eng) > 0:
                name_eng = names_eng[0]
            # Each metadata field lives in an <li> whose text starts with a Chinese label.
            names_nick = content.xpath('li[contains(text(),"%s")]/a/text()' % (u'又名'))
            if len(names_nick) > 0:
                names_nick_new = ",".join(names_nick)
            else:
                names_nick_new = ""
            imgs = simplejson.dumps(
                selector.xpath('//div[contains(@class,"moviedteail_img")]/a/img/@src'))
            tags = content.xpath('li[contains(text(),"%s")]/a/text()' % (u'标签'))
            if len(tags) > 0:
                tags_new = ",".join(tags)
            else:
                tags_new = ""
            areas = content.xpath('li[contains(text(),"%s")]/a/text()' % (u'地区'))
            if len(areas) > 0:
                areas_new = ",".join(areas)
            else:
                areas_new = ""
            years = content.xpath('li[contains(text(),"%s")]/a/text()' % (u'年份'))
            if len(years) > 0:
                years_new = ",".join(years)
            else:
                years_new = ""
            directors = content.xpath('li[contains(text(),"%s")]/a/text()' % (u'导演'))
            if len(directors) > 0:
                directors_new = ",".join(directors)
            else:
                directors_new = ""
            writers = content.xpath('li[contains(text(),"%s")]/a/text()' % (u'编剧'))
            if len(writers) > 0:
                writers_new = ",".join(writers)
            else:
                writers_new = ""
            casts = content.xpath('li[contains(text(),"%s")]/a/text()' % (u'主演'))
            if len(casts) > 0:
                casts_new = ",".join(casts)
            else:
                casts_new = ""
            imdbs = content.xpath('li[contains(text(),"%s")]/a/text()' % (u'imdb'))
            if len(imdbs) > 0:
                imdbs_new = ",".join(imdbs)
            else:
                imdbs_new = ""
            details = self.getDetails(
                content.xpath('li[contains(text(),"%s")]/a/@href' % (u'详情')))
            if len(details) > 0:
                details_new = details[0]
            else:
                details_new = ""
            created_at = time.strftime('%Y-%m-%d %H:%M:%S')
            # Download links: each .tinfo block holds one title/href pair.
            downloads = selector.xpath('//div[contains(@class,"tinfo")]')
            download = []
            for d in downloads:
                try:
                    dn_text = d.xpath('a[1]/@title')[0]
                    dn_url = d.xpath('a[1]/@href')[0]
                    download.append({dn_text: dn_url})
                except:
                    pass
            download_json = simplejson.dumps(download)
            sql_pattern = 'insert ignore INTO `bttiantang_content`(`names_chn`, `names_eng`,`names_nick`,`imgs`,`tags`, `areas`, `years`, `directors`, `writers`,`casts`, `imdbs`,`details`, `download`,`created_at`, `url`) VALUES(%s, %s, %s,%s,%s,%s, %s, %s,%s, %s,%s, %s,%s, %s, %s)'
            sql_values = (name_chn, name_eng, names_nick_new, imgs, tags_new, areas_new,
                          years_new, directors_new, writers_new, casts_new, imdbs_new,
                          details_new, download_json, created_at, url)
            mysqlDao.executeValues(sql_pattern, sql_values)
            # Mark the URL as done (status=1).
            sql = 'update bttiantang_url set `status`=1 where `id`=' + str(id)
            mysqlDao.execute(sql)
            mysqlDao.close()
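# Both run() methods read like the body of a worker thread: self.name and the
# crontab note suggest the surrounding class subclasses threading.Thread, though
# that class definition is not shown here. A minimal sketch of how such a worker
# might be declared and launched, with BttiantangWorker as a hypothetical name:
import threading


class BttiantangWorker(threading.Thread):
    def run(self):
        # ... body as above: claim a URL, parse it, store it, repeat ...
        pass


if __name__ == '__main__':
    workers = [BttiantangWorker(name='worker-%d' % i) for i in range(4)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()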