def doIt(author, url):
    """Crawl a toutiao article-list page for `author` and store article URLs.

    Each article found is inserted into zmt_toutiao_url, then pagination is
    followed while the last pager link reads u'下一页' ("next page").

    :param author: author name stored alongside each URL
    :param url: list-page URL to start crawling from
    """
    mysql_dao = MysqlDao()
    # Iterative pagination replaces the original self-recursion, which could
    # exhaust the stack on long histories; observable behavior is the same.
    while url:
        headers = Headers.get_headers()
        proxies = Proxies.get_proxies()
        selector = None
        try:
            html = requests.get(url, headers=headers, timeout=30,
                                proxies=proxies).content
            selector = etree.HTML(html)
            urls = selector.xpath('//h3/a[1]/@href')
            imgs = selector.xpath(
                '//div[@class="list_image"]/ul[1]/li[1]/a[1]/img[1]/@src')
            category_id = 0
            print(urls)
            # zip pairs each article URL with its image; unlike the original
            # index loop it cannot raise IndexError when imgs is shorter.
            for url2, img_main in zip(urls, imgs):
                created_at = time.strftime('%Y-%m-%d %H:%M:%S')
                # SECURITY NOTE: SQL is built by string concatenation and is
                # injection-prone; MysqlDao's parameter support is unknown
                # from this chunk, so this is flagged rather than changed.
                insert_value = '"' + str(
                    category_id
                ) + '","' + url2 + '","' + img_main + '","' + author + '",0,"' + created_at + '"'
                sql = 'insert ignore into zmt_toutiao_url (`category_id`,`url`,`img_main`,`author`,`status`,`created_at`) values (' + insert_value + ')'
                print(sql)
                mysql_dao.execute(sql)
        except Exception as e:
            # BUG FIX: the original also printed the Exception *class*
            # itself, which carries no information; print the instance only.
            print(e)
        next_url = None
        try:
            # Follow pagination only when the page was parsed successfully.
            if selector is not None:
                next_name = selector.xpath('//*[@id="pagebar"]/a[last()]/text()')
                if len(next_name) > 0 and u'下一页' in next_name[0]:
                    next_url = selector.xpath(
                        '//*[@id="pagebar"]/a[last()]/@href')[0]
        except Exception as e:
            print(e)
        url = next_url
#-*- coding:utf-8 -*- import sys import simplejson from public.mysqlpooldao import MysqlDao from public.redispooldao import RedisDao redis_key = 'dianpingtest:20170104_dianping_shop_list_url' mysql_dao = MysqlDao() redis_dao = RedisDao() reload(sys) sys.setdefaultencoding('utf-8') if __name__ == '__main__': sql = 'SELECT * FROM `20170104_dianping_shop_list_url` WHERE `status`=0' district_lists = mysql_dao.execute(sql) for district_list in district_lists: district_list_json = simplejson.dumps(district_list) redis_dao.rpush(redis_key, district_list_json) print district_list_json
# NOTE(review): the lines above `if __name__` are the tail of a doIt-style
# crawl function recovered from a collapsed source line; `e`, `selector` and
# `author` are bound out of view.
print(e)
try:
    # next page (翻页)
    next_name = selector.xpath('//*[@id="pagebar"]/a[last()]/text()')
    if len(next_name) > 0:
        if u'下一页' in next_name[0]:
            next_url = selector.xpath(
                '//*[@id="pagebar"]/a[last()]/@href')[0]
            doIt(author, next_url)
except Exception as e:
    print(Exception)
    print(e)


if __name__ == '__main__':
    mysql_dao = MysqlDao()
    # Drain unprocessed author rows one at a time.
    while True:
        sql = 'select * from zmt_toutiaohao_url WHERE `time`=0 limit 0,1'
        ret = mysql_dao.execute(sql)
        if len(ret) == 0:
            break
        res = ret[0]
        id = res[0]
        author = res[1]
        url = res[2]
        # NOTE(review): with the status UPDATE below commented out, the
        # SELECT keeps returning the same row, so this loop never
        # terminates on its own — confirm whether that is intended.
        # sql = 'update zmt_toutiaohao_url set `time`=1 where `id`=' + str(id)
        # res = mysql_dao.execute(sql)
        doIt(author, url)
    mysql_dao.close()
    print('game over')
# NOTE(review): the code above `if __name__` is the tail of
# get_download_info(subject, url, category_name); its header precedes this
# chunk, so `values`, `selector1`, `content`, `mysql_dao` are bound out of view.
# SECURITY NOTE: all SQL here is built by %-interpolation — injection-prone.
sql7 = 'INSERT INTO cn163_download_info (subject,url,category_name,download_url,download_name,content) VALUES ("%s","%s","%s","%s","%s","%s")' % values
mysql_dao.execute(sql7)
print sql7
# Text nodes that are NOT anchor text — candidate download names.
xx_xx_xx = selector1.xpath('//text()')
xx_aa = selector1.xpath('//a/text()')
output = set(xx_xx_xx) - set(xx_aa)
for x in output:
    # NOTE(review): `abs` shadows the builtin abs() inside this loop.
    abs = re.findall('(.*mp4|.*mkv|.*720P)', x)
    abs_url = ''
    if abs:
        for abs_single in abs:
            # Strip pipe and full-width parenthesis from the matched name.
            abs_single_text = abs_single.replace('|', '').replace(
                u'(', '')
            # print abs_url, abs_single_text, content
            values = (subject, url, category_name, abs_url,
                      abs_single_text, content)
            sql8 = 'INSERT INTO cn163_download_info (subject,url,category_name,download_url,download_name,content) VALUES ("%s","%s","%s","%s","%s","%s")' % values
            mysql_dao.execute(sql8)
            print sql8


if __name__ == '__main__':
    sql = 'SELECT `subject`,`single_url`,`category_name` FROM all_thread_list'
    res = mysql_dao.execute(sql)
    # print res
    for (subject, url, category_name) in res:
        print subject, url, category_name
        get_download_info(subject, url, category_name)
# NOTE(review): the lines above `if __name__` are the tail of a request loop
# whose `try:` header precedes this chunk; `req` and `page` are bound above,
# and the indentation here is a best-effort reconstruction.
    except:
        print('error')
    else:
        # Request returned; log its HTTP status.
        print(req.status_code)
    page = page - 1


if __name__ == '__main__':
    # Drain the shop-list queue; each entry is a JSON-encoded DB row.
    while True:
        district_list_json = redis_dao.lpop(redis_key)
        if district_list_json is None:
            break
        district_list = simplejson.loads(district_list_json)
        list_id = district_list[0]
        city_name = district_list[1]
        district_name = district_list[2]
        category_name = district_list[3]
        list_url = district_list[4]
        print list_id, city_name, district_name, category_name, list_url
        last_page = get_last_page(list_url)
        try:
            get_shop_info(list_url, last_page, city_name, district_name,
                          category_name, list_id)
        except Exception as e:
            traceback.print_exc()
            print(e)
            # Skip the status update so the row can be retried later.
            continue
        sql = 'UPDATE `20170104_dianping_shop_list_url` SET `status`="1" WHERE (`id`="%s")' % list_id
        print(sql)
        mysql_dao.execute(sql)
# NOTE(review): top-level fragment; `conn`, `res`, `mysql_dao` and
# get_commentInfo are bound/defined out of view.
conn.commit()
# NOTE(review): `res` (the iterable) is rebound inside the loop below to the
# HTTP response; iteration continues over the original object, but the
# shadowing is confusing. `id` also shadows the builtin.
for (id, objid, url, title) in res:
    # Self-assignments kept from the original; they are no-ops.
    id = id
    objid = objid
    title = title
    base_news_url = url
    comment_url = 'http://reply.autohome.com.cn/api/comments/show.json?count=50&page=1&id=' + objid + '&appid=1&datatype=jsonp&order=0&replyid=0'
    res = requests.get(comment_url)
    if res.status_code == 200:
        wb_data = res.content
        wb_data_qunull = wb_data.replace('null', '0')
        # SECURITY WARNING: eval() on a network response can execute
        # arbitrary code; json.loads on the payload would be safe
        # (flagged, not changed).
        req = eval(wb_data_qunull)
        comment_count = int(req['commentcountall'])
        # Python-2 integer division: this computes ceil(comment_count / 50).
        if comment_count % 50 != 0:
            totalpage = comment_count / 50 + 1
        else:
            totalpage = comment_count / 50
        pagenum = totalpage
        # Crawl at most the first 2 comment pages.
        if pagenum >= 2:
            lastpage = 2
        else:
            lastpage = pagenum
        try:
            get_commentInfo(objid, base_news_url, title, lastpage)
        except:
            print 'error'
        else:
            # Mark the row done only when the crawl raised nothing.
            sql2 = 'UPDATE `xcar_news_post_20170504` SET `status_comment`="0" WHERE (`id`="%s")' % id
            print(sql2)
            mysql_dao.execute(sql2)
# NOTE(review): top-level fragment; `cur`, `conn`, `mysql_dao`,
# get_normalnews and get_picturenews are bound/defined out of view.
# `id` shadows the builtin inside the loop.
res = cur.fetchall()
conn.commit()
for (id, objid, url, title, picture_if) in res:
    print url
    # Self-assignments kept from the original; they are no-ops.
    id = id
    objid = objid
    title = title
    picture_if = picture_if
    news_url = url
    print picture_if
    # picture_if is a string flag: '0' = normal article, otherwise a
    # picture-gallery article.
    if picture_if == '0':
        try:
            get_normalnews(objid, news_url, title, picture_if)
        except:
            print 'error'
        else:
            # Mark the row done only when the crawl raised nothing.
            sql3 = 'UPDATE `xcar_news_post_20170504` SET `status_catch_picture`="0" WHERE (`id`="%s")' % id
            print(sql3)
            mysql_dao.execute(sql3)
    else:
        try:
            get_picturenews(objid, news_url, title, picture_if)
        except:
            print 'error'
        else:
            sql4 = 'UPDATE `xcar_news_post_20170504` SET `status_catch_picture`="0" WHERE (`id`="%s")' % id
            print(sql4)
            mysql_dao.execute(sql4)
# -*- coding: utf-8 -*- import sys reload(sys) sys.setdefaultencoding('utf-8') import simplejson from public.mysqlpooldao import MysqlDao from public.redispooldao import RedisDao redis_key = 'gaode:20170209_gaode_dianping_sectionl' mysql_dao = MysqlDao() redis_dao = RedisDao() if __name__ == '__main__': sql = 'SELECT * FROM `a_gaode_section_longitude_latitude` WHERE `status`=0' section_lists = mysql_dao.execute(sql) # print section_lists for section_list in section_lists: section_list_json = simplejson.dumps(section_list) print section_list_json redis_dao.rpush(redis_key, section_list_json) print(section_list_json)
# NOTE(review): fragment from inside an unseen loop; `judge_if`, `selector`,
# `objid`, `url`, `title`, `publish_date`, `id`, `mysql_dao` and
# get_newsPostInfo are bound out of view.
if judge_if:
    # print u'正常页面'  (a normal article page)
    nextpage_node = selector.xpath(
        '//div[@class="area article"]/div[@class="page"]/span[@class="page-item-info"]/text()'
    )
    if nextpage_node:
        # Pager label looks like u'共N页' ("N pages total"); strip the
        # surrounding characters to extract N.
        lastpage = int(nextpage_node[0].replace(u'共', '').replace(u'页', ''))
        try:
            get_newsPostInfo(objid, url, lastpage, title, publish_date)
        except:
            print "error"
        else:
            # Mark the row done only when the crawl raised nothing.
            sql3 = 'UPDATE `xcar_news_thread_20170503` SET `status`="0" WHERE (`id`="%s")' % id
            print(sql3)
            mysql_dao.execute(sql3)
    else:
        # No pager — single-page article.
        lastpage = 1
        try:
            get_newsPostInfo(objid, url, lastpage, title, publish_date)
        except:
            print "error"
        else:
            sql1 = 'UPDATE `xcar_news_thread_20170503` SET `status`="0" WHERE (`id`="%s")' % id
            print(sql1)
            mysql_dao.execute(sql1)
else:
    print u"图片新闻"
    # print url
def run(self):
    """Worker-thread body: drain today's toutiao URL queue from Redis and
    persist each article's extracted content into MySQL.

    NOTE(review): recovered from collapsed source lines; the nesting of the
    `title = ...` section relative to `if category_id != 0` is a best-effort
    reconstruction — confirm against the original file.
    """
    mysql_dao = MysqlDao()
    redis_dao = RedisDao()
    while True:
        print(self.getName())
        # Queue key is per-day, e.g. queue:toutiao_20170101.
        date = time.strftime('%Y%m%d')
        data_json = redis_dao.lpop('queue:toutiao_%s' % date)
        # NOTE(review): `is None` is the idiomatic test here.
        if data_json == None:
            break
        data = simplejson.loads(data_json)
        category_id = data['category_id']
        url = data['url']
        img_main = data['img_main']
        author = data['author']
        try:
            headers = Headers.get_headers()
            proxies = Proxies.get_proxies()
            html = requests.get(url, headers=headers, timeout=30,
                                proxies=proxies).content
            selector = etree.HTML(html)
            # The "about us" span is used to recognize a genuine page.
            status = selector.xpath('//*[@id="aboutus"]/div[1]/span[1]/text()')
            if len(status) > 0:
                if u'今日头条' in status[0]:
                    category_names = selector.xpath(
                        '//div[@class="curpos"]/a[2]/text()')
                    if len(category_names) != 0:
                        category_name = category_names[0]
                        # NOTE(review): `and` requires BOTH substrings to be
                        # present before skipping; `or` may have been
                        # intended — confirm.
                        if u'图片' in category_name and u'视频' in category_name:
                            pass
                        else:
                            if category_id != 0:
                                # Harvest linked toutiao-hao author pages.
                                toutiaohao_authors = selector.xpath(
                                    '//*[contains(@class,"gc_name")]/text()')
                                toutiaohao_urls = selector.xpath(
                                    '//*[contains(@class,"gc_name")]/@href')
                                try:
                                    toutiaohao_num = 0
                                    for toutiaohao_url in toutiaohao_urls:
                                        # SECURITY NOTE: SQL built by string
                                        # concatenation — injection-prone.
                                        toutiaohao_sql = 'insert ignore into zmt_toutiaohao_url (`author`,`url`) values ("' + \
                                            toutiaohao_authors[toutiaohao_num] + '","' + \
                                            toutiaohao_urls[
                                                toutiaohao_num] + '")'
                                        toutiaohao_num = toutiaohao_num + 1
                                        mysql_dao.execute(toutiaohao_sql)
                                except Exception as e:
                                    print(Exception)
                                    print(e)
                            title = selector.xpath('//*[@class="title"]/text()')
                            if len(title) > 0:
                                title_t = title[0].replace('"', '')
                            else:
                                title_t = ''
                            content = selector.xpath(
                                '//*[@class="article-content"]/descendant::text()')
                            img = selector.xpath(
                                '//img[@onerror="javascript:errorimg.call(this);"]/@src')
                            content_str = ''
                            img_str = ''
                            # Join text/image fragments with {ycontent}/{yimg}
                            # markers, stripping double quotes so the values
                            # survive the hand-built SQL below.
                            for c in content:
                                content_str = content_str + '{ycontent}' + c.replace('"', '')
                            for img_i in img:
                                img_str = img_str + '{yimg}' + img_i.replace('"', '')
                            time_now = time.strftime('%Y-%m-%d %H:%M:%S')
                            time_ts = selector.xpath('//*[@class="time"]/text()')
                            if len(time_ts) > 0:
                                time_t = time_ts[0].replace('"', '')
                            else:
                                time_t = ''
                            insert_value = '"' + str(
                                category_id) + '","' + title_t + '","' + content_str + '","' + url + '","' + img_main + '","","' + img_str + '","","' + author + '","' + time_t + '","' + time_now + '","' + time_now + '"';
                            sql = 'insert ignore into zmt_content (`category_id`,`title`,`content`,`url`,`img_main`,`img_main_oss`,`img`,`img_oss`,`author`,`time`,`created_at`,`updated_at`) values (' + insert_value + ')'
                            print(sql)
                            # Only store articles whose text was extracted.
                            if content_str != '':
                                mysql_dao.execute(sql)
        except Exception as e:
            print(Exception)
            print(e)
    mysql_dao.close()
def get_keywords():
    """Return the name/credit keyword columns from bttiantang_content.

    NOTE(review): the SQL has no space between `casts` and FROM; MySQL
    tolerates a keyword directly after a closing backtick, but confirm.
    """
    dao = MysqlDao()
    query = ('select `names_chn`,`names_eng`,`names_nick`,`directors`,'
             '`writers`,`casts`FROM bttiantang_content')
    return dao.execute(query)
import json import datetime from public.mysqlpooldao import MysqlDao from public.redispooldao import RedisDao redis_key = 'gaode:20170214_gaode_shop_info' mysql_dao = MysqlDao() redis_dao = RedisDao() class CJsonEncoder(json.JSONEncoder): def default(self, obj): if isinstance(obj, datetime.datetime): return obj.strftime('%Y-%m-%d %H:%M:%S') elif isinstance(obj, datetime.date): return obj.strftime('%Y-%m-%d') else: return json.JSONEncoder.default(self, obj) if __name__ == '__main__': sql = 'SELECT * FROM `c_gaode_dianping_shop_info` WHERE `status`=0 limit 100000' print sql section_lists = list(mysql_dao.execute(sql)) random.shuffle(section_lists) for section_list in section_lists: section_list_json = json.dumps(section_list, cls=CJsonEncoder) # print section_list_json redis_dao.rpush(redis_key, section_list_json) print(section_list_json)