예제 #1
0
def doIt(author, url):
    """Crawl an author's listing pages and queue each article URL in MySQL.

    Starting at ``url``, every page is downloaded and parsed with lxml.  Each
    article link is paired with its thumbnail image and written to
    ``zmt_toutiao_url`` with ``insert ignore``.  The crawl then follows the
    last link of the ``pagebar`` element for as long as that link's text
    contains u'下一页' ("next page").

    Pages are walked iteratively rather than recursively, so one ``MysqlDao``
    is reused and long listings cannot exhaust the recursion limit.  A
    visited-set stops the crawl if the pager ever points back to a page that
    was already processed.

    Errors raised while fetching, parsing or inserting are printed and never
    propagated to the caller.

    Args:
        author: Author name stored with every queued URL.
        url: URL of the first listing page.
    """
    mysql_dao = MysqlDao()
    category_id = 0
    visited = set()
    while url not in visited:
        visited.add(url)
        # Headers and proxy are requested again for every page.
        headers = Headers.get_headers()
        proxies = Proxies.get_proxies()
        try:
            html = requests.get(url, headers=headers, timeout=30,
                                proxies=proxies).content
            selector = etree.HTML(html)
        except Exception as e:
            print(Exception)
            print(e)
            # Without a parsed page there is nothing to store or to follow.
            return

        try:
            urls = selector.xpath('//h3/a[1]/@href')
            imgs = selector.xpath(
                '//div[@class="list_image"]/ul[1]/li[1]/a[1]/img[1]/@src')
            print(urls)
            # zip() stops at the shorter list, so a link without a matching
            # image is not stored.
            for url2, img_main in zip(urls, imgs):
                created_at = time.strftime('%Y-%m-%d %H:%M:%S')
                # NOTE(review): values are concatenated into the SQL text
                # unescaped; switch to bound parameters if MysqlDao.execute
                # supports them.
                insert_value = '"' + str(
                    category_id
                ) + '","' + url2 + '","' + img_main + '","' + author + '",0,"' + created_at + '"'
                sql = 'insert ignore into zmt_toutiao_url (`category_id`,`url`,`img_main`,`author`,`status`,`created_at`) values (' + insert_value + ')'
                print(sql)
                mysql_dao.execute(sql)
        except Exception as e:
            # A failed insert must not prevent moving on to the next page.
            print(Exception)
            print(e)

        try:
            # Pagination: continue only while the last pager link is "next".
            next_name = selector.xpath('//*[@id="pagebar"]/a[last()]/text()')
            if not next_name or u'下一页' not in next_name[0]:
                return
            url = selector.xpath('//*[@id="pagebar"]/a[last()]/@href')[0]
        except Exception as e:
            print(Exception)
            print(e)
            return
예제 #2
0
#-*- coding:utf-8 -*-

import sys
import simplejson
from public.mysqlpooldao import MysqlDao
from public.redispooldao import RedisDao

# Redis list that receives the pending shop-list rows.
redis_key = 'dianpingtest:20170104_dianping_shop_list_url'
mysql_dao = MysqlDao()
redis_dao = RedisDao()

# Python 2 only: make utf-8 the interpreter's default string encoding.
reload(sys)
sys.setdefaultencoding('utf-8')

if __name__ == '__main__':
    # Push every row still marked status=0 onto the Redis list as JSON,
    # echoing each payload after it has been queued.
    pending_rows = mysql_dao.execute(
        'SELECT * FROM `20170104_dianping_shop_list_url` WHERE `status`=0')
    for pending_row in pending_rows:
        payload = simplejson.dumps(pending_row)
        redis_dao.rpush(redis_key, payload)
        print(payload)
예제 #3
0
        print(e)
    try:
        # 翻页
        next_name = selector.xpath('//*[@id="pagebar"]/a[last()]/text()')
        if len(next_name) > 0:
            if u'下一页' in next_name[0]:
                next_url = selector.xpath(
                    '//*[@id="pagebar"]/a[last()]/@href')[0]
                doIt(author, next_url)
    except Exception as e:
        print(Exception)
        print(e)


if __name__ == '__main__':
    # Process author pages one at a time until no row with `time`=0 is left.
    mysql_dao = MysqlDao()
    try:
        while True:
            sql = 'select * from zmt_toutiaohao_url WHERE `time`=0 limit 0,1'
            ret = mysql_dao.execute(sql)
            if not ret:
                break
            row = ret[0]
            row_id = row[0]
            author = row[1]
            url = row[2]
            # Mark the row as taken before crawling it.  With this update
            # disabled the select above returns the same row on every pass
            # and the loop never terminates.
            mysql_dao.execute(
                'update zmt_toutiaohao_url set `time`=1 where `id`=' + str(row_id))
            doIt(author, url)
    finally:
        # Release the DAO even when a query raises.
        mysql_dao.close()
    print('game over')
예제 #4
0
                    sql7 = 'INSERT INTO cn163_download_info (subject,url,category_name,download_url,download_name,content) VALUES ("%s","%s","%s","%s","%s","%s")' % values
                    mysql_dao.execute(sql7)
                    print sql7

                xx_xx_xx = selector1.xpath('//text()')
                xx_aa = selector1.xpath('//a/text()')
                output = set(xx_xx_xx) - set(xx_aa)
                for x in output:
                    abs = re.findall('(.*mp4|.*mkv|.*720P)', x)
                    abs_url = ''
                    if abs:
                        for abs_single in abs:
                            abs_single_text = abs_single.replace('|',
                                                                 '').replace(
                                                                     u'(', '')
                            # print abs_url, abs_single_text, content
                            values = (subject, url, category_name, abs_url,
                                      abs_single_text, content)
                            sql8 = 'INSERT INTO cn163_download_info (subject,url,category_name,download_url,download_name,content) VALUES ("%s","%s","%s","%s","%s","%s")' % values
                            mysql_dao.execute(sql8)
                            print sql8


if __name__ == '__main__':
    sql = 'SELECT `subject`,`single_url`,`category_name` FROM all_thread_list'
    res = mysql_dao.execute(sql)
    # print res
    for (subject, url, category_name) in res:
        print subject, url, category_name
        get_download_info(subject, url, category_name)
                    except:
                        print('error')
        else:
            print(req.status_code)
        page = page - 1


if __name__ == '__main__':
    while True:
        district_list_json = redis_dao.lpop(redis_key)
        if district_list_json is None:
            break
        district_list = simplejson.loads(district_list_json)
        list_id = district_list[0]
        city_name = district_list[1]
        district_name = district_list[2]
        category_name = district_list[3]
        list_url = district_list[4]
        print list_id, city_name, district_name, category_name, list_url
        last_page = get_last_page(list_url)
        try:
            get_shop_info(list_url, last_page, city_name, district_name,
                          category_name, list_id)
        except Exception as e:
            traceback.print_exc()
            print(e)
            continue
        sql = 'UPDATE `20170104_dianping_shop_list_url` SET `status`="1" WHERE (`id`="%s")' % list_id
        print(sql)
        mysql_dao.execute(sql)
    conn.commit()
    for (id, objid, url, title) in res:
        id = id
        objid = objid
        title = title
        base_news_url = url
        comment_url = 'http://reply.autohome.com.cn/api/comments/show.json?count=50&page=1&id=' + objid + '&appid=1&datatype=jsonp&order=0&replyid=0'
        res = requests.get(comment_url)
        if res.status_code == 200:
            wb_data = res.content
            wb_data_qunull = wb_data.replace('null', '0')
            req = eval(wb_data_qunull)
            comment_count = int(req['commentcountall'])
            if comment_count % 50 != 0:
                totalpage = comment_count / 50 + 1
            else:
                totalpage = comment_count / 50
            pagenum = totalpage
            if pagenum >= 2:
                lastpage = 2
            else:
                lastpage = pagenum
            try:
                get_commentInfo(objid, base_news_url, title, lastpage)
            except:
                print 'error'
            else:
                sql2 = 'UPDATE `xcar_news_post_20170504` SET `status_comment`="0" WHERE (`id`="%s")' % id
                print(sql2)
                mysql_dao.execute(sql2)
    res = cur.fetchall()
    conn.commit()
    for (id, objid, url, title, picture_if) in res:
        print url
        id = id
        objid = objid
        title = title
        picture_if = picture_if
        news_url = url
        print picture_if

        if picture_if == '0':
            try:
                get_normalnews(objid, news_url, title, picture_if)
            except:
                print 'error'
            else:
                sql3 = 'UPDATE `xcar_news_post_20170504` SET `status_catch_picture`="0" WHERE (`id`="%s")' % id
                print(sql3)
                mysql_dao.execute(sql3)

        else:
            try:
                get_picturenews(objid, news_url, title, picture_if)
            except:
                print 'error'
            else:
                sql4 = 'UPDATE `xcar_news_post_20170504` SET `status_catch_picture`="0" WHERE (`id`="%s")' % id
                print(sql4)
                mysql_dao.execute(sql4)
예제 #8
0
# -*- coding: utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import simplejson
from public.mysqlpooldao import MysqlDao
from public.redispooldao import RedisDao

# Redis list that receives the pending section rows.
redis_key = 'gaode:20170209_gaode_dianping_sectionl'
mysql_dao = MysqlDao()
redis_dao = RedisDao()

if __name__ == '__main__':
    # Serialise every status=0 section row to JSON and append it to the
    # Redis list; the payload is echoed before and after the push.
    pending_rows = mysql_dao.execute(
        'SELECT * FROM `a_gaode_section_longitude_latitude` WHERE `status`=0')
    for pending_row in pending_rows:
        payload = simplejson.dumps(pending_row)
        print(payload)
        redis_dao.rpush(redis_key, payload)
        print(payload)
예제 #9
0
        if judge_if:
            # print u'正常页面'
            nextpage_node = selector.xpath(
                '//div[@class="area article"]/div[@class="page"]/span[@class="page-item-info"]/text()'
            )
            if nextpage_node:
                lastpage = int(nextpage_node[0].replace(u'共',
                                                        '').replace(u'页', ''))
                try:
                    get_newsPostInfo(objid, url, lastpage, title, publish_date)
                except:
                    print "error"
                else:
                    sql3 = 'UPDATE `xcar_news_thread_20170503` SET `status`="0" WHERE (`id`="%s")' % id
                    print(sql3)
                    mysql_dao.execute(sql3)

            else:
                lastpage = 1
                try:
                    get_newsPostInfo(objid, url, lastpage, title, publish_date)
                except:
                    print "error"
                else:
                    sql1 = 'UPDATE `xcar_news_thread_20170503` SET `status`="0" WHERE (`id`="%s")' % id
                    print(sql1)
                    mysql_dao.execute(sql1)

        else:
            print u"图片新闻"
            # print url
예제 #10
0
    def run(self):
        """Drain today's toutiao queue and store each article in MySQL.

        Jobs are popped from the Redis list ``queue:toutiao_<YYYYMMDD>`` until
        ``lpop`` returns ``None``.  Each job is a JSON object with the keys
        ``category_id``, ``url``, ``img_main`` and ``author``.  The page at
        ``url`` is downloaded and parsed with lxml.  Pages that pass the
        filters below produce:

        * one ``insert ignore`` into ``zmt_toutiaohao_url`` per element whose
          class contains ``gc_name`` (author name and link), and
        * one ``insert ignore`` into ``zmt_content`` holding the title, text,
          images and timestamps, executed only when some text was extracted.

        Errors raised while fetching or storing a page are printed and the
        loop moves on to the next job.  The MySQL DAO is closed when the loop
        ends, including when an uncaught exception escapes it.
        """
        mysql_dao = MysqlDao()
        redis_dao = RedisDao()
        try:
            while True:
                print(self.getName())
                # Recomputed per job so the queue name follows the date.
                date = time.strftime('%Y%m%d')
                data_json = redis_dao.lpop('queue:toutiao_%s' % date)
                if data_json is None:
                    break
                data = simplejson.loads(data_json)
                category_id = data['category_id']
                url = data['url']
                img_main = data['img_main']
                author = data['author']
                try:
                    headers = Headers.get_headers()
                    proxies = Proxies.get_proxies()
                    html = requests.get(url, headers=headers, timeout=30, proxies=proxies).content
                    selector = etree.HTML(html)

                    # Only pages whose "aboutus" span mentions the site name
                    # are processed.
                    status = selector.xpath('//*[@id="aboutus"]/div[1]/span[1]/text()')
                    if not status or u'今日头条' not in status[0]:
                        continue
                    category_names = selector.xpath('//div[@class="curpos"]/a[2]/text()')
                    if not category_names:
                        continue
                    category_name = category_names[0]
                    # NOTE(review): this skips only categories containing
                    # BOTH u'图片' and u'视频'; `or` may have been intended.
                    if u'图片' in category_name and u'视频' in category_name:
                        continue
                    if category_id == 0:
                        continue

                    toutiaohao_authors = selector.xpath('//*[contains(@class,"gc_name")]/text()')
                    toutiaohao_urls = selector.xpath('//*[contains(@class,"gc_name")]/@href')
                    try:
                        # NOTE(review): values are concatenated into the SQL
                        # text unescaped; use bound parameters if
                        # MysqlDao.execute supports them.
                        for toutiaohao_author, toutiaohao_url in zip(toutiaohao_authors, toutiaohao_urls):
                            toutiaohao_sql = 'insert ignore into zmt_toutiaohao_url (`author`,`url`) values ("' + \
                                             toutiaohao_author + '","' + \
                                             toutiaohao_url + '")'
                            mysql_dao.execute(toutiaohao_sql)
                    except Exception as e:
                        # Failing to record authors must not lose the article.
                        print(Exception)
                        print(e)

                    # Double quotes are stripped from scraped text because the
                    # values are wrapped in double quotes in the SQL below.
                    title = selector.xpath('//*[@class="title"]/text()')
                    title_t = title[0].replace('"', '') if title else ''
                    content = selector.xpath('//*[@class="article-content"]/descendant::text()')
                    img = selector.xpath('//img[@onerror="javascript:errorimg.call(this);"]/@src')
                    content_str = ''.join('{ycontent}' + c.replace('"', '') for c in content)
                    img_str = ''.join('{yimg}' + img_i.replace('"', '') for img_i in img)
                    time_now = time.strftime('%Y-%m-%d %H:%M:%S')
                    time_ts = selector.xpath('//*[@class="time"]/text()')
                    time_t = time_ts[0].replace('"', '') if time_ts else ''

                    # Column order matches the insert statement; the two empty
                    # strings fill `img_main_oss` and `img_oss`.
                    values = [str(category_id), title_t, content_str, url, img_main, '',
                              img_str, '', author, time_t, time_now, time_now]
                    insert_value = '"' + '","'.join(values) + '"'
                    sql = 'insert ignore into zmt_content (`category_id`,`title`,`content`,`url`,`img_main`,`img_main_oss`,`img`,`img_oss`,`author`,`time`,`created_at`,`updated_at`) values (' + insert_value + ')'
                    print(sql)
                    if content_str != '':
                        mysql_dao.execute(sql)
                except Exception as e:
                    print(Exception)
                    print(e)
        finally:
            mysql_dao.close()
예제 #11
0
def get_keywords():
    """Query the name and crew columns of every ``bttiantang_content`` row.

    Returns:
        The value returned by ``MysqlDao.execute`` for the select statement
        (presumably the result rows; confirm against MysqlDao).
    """
    query = 'select `names_chn`,`names_eng`,`names_nick`,`directors`,`writers`,`casts`FROM bttiantang_content'
    return MysqlDao().execute(query)
예제 #12
0
import json
import datetime
from public.mysqlpooldao import MysqlDao
from public.redispooldao import RedisDao

# Name of the Redis list that the __main__ block below pushes JSON rows onto.
redis_key = 'gaode:20170214_gaode_shop_info'
# Module-level DAO instances shared by the script body.
mysql_dao = MysqlDao()
redis_dao = RedisDao()


class CJsonEncoder(json.JSONEncoder):
    """JSON encoder that renders date and datetime values as strings."""

    def default(self, obj):
        """Return a JSON-serialisable stand-in for ``obj``.

        ``datetime.datetime`` becomes ``'YYYY-MM-DD HH:MM:SS'`` and
        ``datetime.date`` becomes ``'YYYY-MM-DD'``.  Any other type is passed
        to ``json.JSONEncoder.default``.
        """
        # datetime is tested first because it is a subclass of date.
        if isinstance(obj, datetime.datetime):
            pattern = '%Y-%m-%d %H:%M:%S'
        elif isinstance(obj, datetime.date):
            pattern = '%Y-%m-%d'
        else:
            return json.JSONEncoder.default(self, obj)
        return obj.strftime(pattern)


if __name__ == '__main__':
    # `random` is needed for the shuffle below and is not provided by this
    # script's import block, so it is imported here.
    import random

    # Load up to 100000 unprocessed shop rows and push them onto the Redis
    # list in random order.
    sql = 'SELECT * FROM `c_gaode_dianping_shop_info` WHERE `status`=0 limit 100000'
    print(sql)
    section_lists = list(mysql_dao.execute(sql))
    random.shuffle(section_lists)
    for section_list in section_lists:
        # CJsonEncoder turns date/datetime column values into strings.
        section_list_json = json.dumps(section_list, cls=CJsonEncoder)
        redis_dao.rpush(redis_key, section_list_json)
        print(section_list_json)