Example #1
def parse_leftsecond(leftsecond_node):
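    """Build an NbaNews item for the left 'news-hot' block: download the block's
    image into the local static folder, number it continuing from
    get_max_num('leftsec'), and leave the remaining text fields empty."""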
    max_leftsec_num = get_max_num('leftsec')
    if max_leftsec_num is None:
        max_leftsec_num = 0
    curr_num_of_lesec = max_leftsec_num + 1
    nbanews = NbaNews()
    leftsecsel = Selector(
        text=str(leftsecond_node),
        type="html",
    )
    leftsecimgsrcurl = leftsecsel.xpath('//img//@src').extract()[0].strip()

    leftsecfile_name = "leftsec_%s.jpg" % curr_num_of_lesec
    leftsecfile_path = os.path.join(
        "D:\StefanClub\StefanClub\www\static\img\sinasports", leftsecfile_name)
    urllib.request.urlretrieve(leftsecimgsrcurl, leftsecfile_path)
    nbanews["number"] = curr_num_of_lesec
    #curr_num_of_lesec = curr_num_of_lesec + 1
    nbanews["imgsrcurl"] = "../static/img/sinasports/%s" % leftsecfile_name
    nbanews["imgurl"] = None
    nbanews["isvideo"] = None
    nbanews["title"] = None
    nbanews["titleurl"] = None
    nbanews["newstime"] = None
    nbanews["comment_url"] = None
    for j in range(1, 6):
        nbanews["tag%s" % j] = None
        nbanews["tag%surl" % j] = None
    nbanews["newstype"] = 'leftsec'
    return nbanews
Example #2
def parse_zhihuhot_comment(response, hotid):
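    """Parse one page of Zhihu comment JSON for the hot item `hotid`: for every
    comment, download the commenter's avatar, fill a ZhihuHotComment item and
    pass it through the duplicate/redis checks before inserting into MySQL."""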
    max_comuserimg_num = get_max_num('zhihuhotcomments')
    if max_comuserimg_num is None:
        max_comuserimg_num = 0
    curr_num_of_comuser = max_comuserimg_num + 1
    #resultjson = json.loads(response)
    resultjson = response
    comments = resultjson['data']
    comment_item = ZhihuHotComment()
    for comment in comments:
        commentid = comment['id']
        author = comment['author']
        author_member = author['member']
        userimgsrcurl = author_member['avatar_url']
        url_token = author_member['url_token']
        userimgurl = '//www.zhihu.com/people/' + url_token
        username = author_member['name']
        replytime = comment['created_time']
        replytime = datetime.datetime.fromtimestamp(replytime)
        content = comment['content']
        infavorqty = comment['vote_count']
        replytouser = None
        replytouserurl = None
        if "reply_to_author" in comment.keys():
            reply_to_author = comment['reply_to_author']
            if reply_to_author is not None:
                reply_to_author_member = reply_to_author['member']
                replytouser = reply_to_author_member['name']
                replytouser_urltoken = reply_to_author_member['url_token']
                replytouserurl = '//www.zhihu.com/people/' + replytouser_urltoken
        file_name = "zhihuhotcomuser_%s.jpg" % curr_num_of_comuser
        file_path = os.path.join("D:\StefanClub\StefanClub\www\static\img\zhihu", file_name)

        '''proxies_list = [{'http': '121.193.143.249:80'}, {'http': '192.168.1.100:80'}]
        proxies = random.choice(proxies_list)
        proxy_handler = urllib.request.ProxyHandler(proxies)
        opener = urllib.request.build_opener(proxy_handler)
        data = opener.open(userimgsrcurl).read()'''
        data = s.get(userimgsrcurl, headers=agentheaders).content
        with open(file_path, "wb") as code:
            code.write(data)

        #urllib.request.urlretrieve(userimgsrcurl, file_path)
        comment_item["userimgsrcurl"] = "../static/img/zhihu/%s" % file_name
        comment_item["userimgnumber"] = curr_num_of_comuser
        curr_num_of_comuser = curr_num_of_comuser + 1
        comment_item["commentid"] = commentid
        comment_item["hotid"] = hotid
        comment_item["userimgurl"] = userimgurl
        comment_item["username"] = username
        comment_item["replytouser"] = replytouser
        comment_item["replytouserurl"] = replytouserurl
        comment_item["replytime"] = replytime
        comment_item["content"] = content
        comment_item["infavorqty"] = infavorqty
        if duplicate_record.process_item(comment_item) is not None:
            if redis_deduplicate.process_item(comment_item) is not None:
                inserttomysql.process_item(comment_item)
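
# Hypothetical usage sketch (not part of the original code): it assumes `s`,
# `jsonheaders` and `answer_comment_url` are module-level objects like the ones
# used in the spider examples further below.
#
#     result = s.get(answer_comment_url.format(hotid=hotid, offset=0),
#                    headers=jsonheaders).json()
#     parse_zhihuhot_comment(result, hotid)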
Example #3
def parse_zhihuhot_content(response, hotid, hottype):
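    """Walk the children of the answer/article body and store each part as a
    ZhihuHotContent row: plain text via recursive(), <figure> images via
    get_img_info() (downloaded locally), and <a> video links via get_videourl()."""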
    zhihuhot_content = ZhihuHotContent()
    soup = BeautifulSoup(response, 'lxml')
    if hottype == 'question':
        post_node = soup.select("span[class='RichText ztext CopyrightRichText-richText']")
    else:
        post_node = soup.select("div[class='RichText ztext Post-RichText']")
    d = pq(str(post_node[0]))
    children = list(d.children())
    max_contentimg_num = get_max_num('zhihucontent')
    if max_contentimg_num is None:
        max_contentimg_num = 0
    curr_num_of_content = max_contentimg_num + 1
    for i in range(len(children)):
        part_str = None
        imgurl = None
        imgnumber = None
        videourl = None
        '''parttype = None
        if children[i].tag == 'p' or children[i].tag == 'div' or children[i].tag == 'blockquote' or children[i].tag == 'ul' or children[i].tag == 'hr':
            part_str = recursive(children[i])
            parttype = 'text'
            '''
        if children[i].tag == 'a':
            videourl, part_str = get_videourl(children[i])
            if videourl is not None:
                parttype = 'video'
            else:
                parttype = 'text'
        elif children[i].tag == 'figure':
            imgurl, imgnumber = get_img_info(children[i], curr_num_of_content, s, agentheaders)
            curr_num_of_content += 1
            parttype = 'img'
        else:
            part_str = recursive(children[i])
            parttype = 'text'
        if parttype is not None:
            zhihuhot_content['hotid'] = hotid
            zhihuhot_content['partno'] = i + 1
            zhihuhot_content['parttype'] = parttype
            zhihuhot_content['imgurl'] = imgurl
            zhihuhot_content['imgnumber'] = imgnumber
            zhihuhot_content['videourl'] = videourl
            zhihuhot_content['text'] = part_str
            if duplicate_record.process_item(zhihuhot_content) is not None:
                if redis_deduplicate.process_item(zhihuhot_content) is not None:
                    inserttomysql.process_item(zhihuhot_content)
Example #4
def parse_lefttop(lefttopimg_node):
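    """Build an NbaNews item for the left top image block: extract its title,
    link and image, flag protocol-relative links whose host starts with 'video',
    download the image and number it continuing from get_max_num('lefttop')."""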
    max_lefttop_num = get_max_num('lefttop')
    if max_lefttop_num is None:
        max_lefttop_num = 0
    curr_num_of_letop = max_lefttop_num + 1
    nbanews = NbaNews()
    lefttopsel = Selector(
        text=str(lefttopimg_node),
        type="html",
    )
    lefttoptitle = lefttopsel.xpath('//h3/text()').extract()[0].strip()
    lefttopurl = lefttopsel.xpath('//a//@href').extract()[0].strip()
    lefttopimgsrcurl = lefttopsel.xpath('//img//@src').extract()[0].strip()
    lefttopisvideo = lefttopurl[2:7]
    if lefttopisvideo == 'video':
        lefttopisvideo = 'TRUE'
    else:
        lefttopisvideo = 'FALSE'
    lefttopfile_name = "lefttop_%s.jpg" % curr_num_of_letop
    lefttopfile_path = os.path.join(
        "D:\StefanClub\StefanClub\www\static\img\sinasports", lefttopfile_name)
    urllib.request.urlretrieve(lefttopimgsrcurl, lefttopfile_path)
    nbanews["number"] = curr_num_of_letop
    #curr_num_of_letop = curr_num_of_letop + 1
    nbanews["imgsrcurl"] = "../static/img/sinasports/%s" % lefttopfile_name
    nbanews["imgurl"] = lefttopurl
    nbanews["isvideo"] = lefttopisvideo
    nbanews["title"] = lefttoptitle
    nbanews["titleurl"] = None
    nbanews["newstime"] = None
    nbanews["comment_url"] = None
    for j in range(1, 6):
        nbanews["tag%s" % j] = None
        nbanews["tag%surl" % j] = None
    nbanews["newstype"] = 'lefttop'
    return nbanews
Example #5
class CsdnSpider(scrapy.Spider):
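    """Scrape the CSDN front page: the carousel, the right-hand panels and the
    article feed, plus extra articles from the 'more articles' JSON API.
    Images are saved under the local static folder and the items are yielded
    to the pipelines."""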
    name = 'csdn'
    allowed_domains = ["www.csdn.net"]
    start_urls = ['http://www.csdn.net/']

    index_url = 'http://www.csdn.net/'
    more_artice_url = 'https://www.csdn.net/api/articles?type=more&category=home&shown_offset={shown_offset}'

    max_index_news_num = get_max_num('index_news')
    if max_index_news_num is None:
        max_index_news_num = 0
    curr_num_of_article = max_index_news_num + 1

    max_car_number = get_max_num('index_car', 'Carousel')
    if max_car_number is None:
        max_car_number = 0
    curr_num_of_car = max_car_number + 1

    max_car_r_number = get_max_num('index_car', 'Carousel_R')
    if max_car_r_number is None:
        max_car_r_number = 0
    curr_num_of_car_r = max_car_r_number + 1

    max_right_number = get_max_num('index_car', 'Right')
    if max_right_number is None:
        max_right_number = 0
    curr_num_of_right = max_right_number + 1

    def start_requests(self):
        yield Request(self.index_url, callback=self.parse_index)
        for i in range(1, 5):
            yield Request(self.more_artice_url.format(shown_offset=21 +
                                                      (i - 1) * 10),
                          callback=self.parse_more_index_art)

    def parse_index(self, response):
        carousel_item = IndexCarouselItem()
        index_news_item = IndexNews()
        soup = BeautifulSoup(response.text, 'lxml')
        post_nodes = soup.select(".carousel-inner .csdn-tracking-statistics")
        post_nodes1 = soup.select(".carousel-right .carousel-right-u")
        post_nodes2 = soup.select(".company_list li")
        post_nodes3 = soup.select(
            ".feedlist_mod li[class='clearfix'] div[class='list_con']")
        for post_node in post_nodes:
            sel = Selector(
                text=str(post_node),
                type="html",
            )
            title = sel.xpath('//div[@class="carousel-caption"]/text()'
                              ).extract()[0].strip()
            url = sel.xpath('//a//@href').extract()[0].strip()
            img_url = sel.xpath('//img//@src').extract()[0].strip()
            file_name = "carousel_%s.jpg" % (self.curr_num_of_car)
            file_path = os.path.join(
                "D:\StefanClub\StefanClub\www\static\img\csdn", file_name)
            urllib.request.urlretrieve(img_url, file_path)

            carousel_item["number"] = self.curr_num_of_car
            self.curr_num_of_car = self.curr_num_of_car + 1
            carousel_item["title"] = title
            carousel_item["url"] = url
            #carousel_item["img_url"] = img_url
            carousel_item["img_url"] = "../static/img/csdn/%s" % (file_name)
            carousel_item["item_class"] = "Carousel"
            yield carousel_item

        for post_node1 in post_nodes1:
            sel1 = Selector(
                text=str(post_node1),
                type="html",
            )
            title1 = sel1.xpath(
                '//p[@class="carousel-right-caption"]/span/text()').extract(
                )[0].strip()
            url1 = sel1.xpath('//a//@href').extract()[0].strip()
            img_url1 = sel1.xpath('//img//@src').extract()[0].strip()
            file_name1 = "carousel_right_%s.jpg" % (self.curr_num_of_car_r)
            file_path1 = os.path.join(
                "D:\StefanClub\StefanClub\www\static\img\csdn", file_name1)
            urllib.request.urlretrieve(img_url1, file_path1)

            carousel_item["number"] = self.curr_num_of_car_r
            self.curr_num_of_car_r = self.curr_num_of_car_r + 1
            carousel_item["title"] = title1
            carousel_item["url"] = url1
            #carousel_item["img_url"] = img_url
            carousel_item["img_url"] = "../static/img/csdn/%s" % (file_name1)
            carousel_item["item_class"] = "Carousel_R"
            yield carousel_item

        for post_node2 in post_nodes2:
            sel2 = Selector(
                text=str(post_node2),
                type="html",
            )
            title2 = sel2.xpath('//h3/a/text()').extract()[0].strip()
            url2 = sel2.xpath('//h3/a//@href').extract()[0].strip()
            img_url2 = sel2.xpath('//img//@src').extract()[0].strip()
            file_name2 = "right_%s.jpg" % (self.curr_num_of_right)
            file_path2 = os.path.join(
                "D:\StefanClub\StefanClub\www\static\img\csdn", file_name2)
            urllib.request.urlretrieve(img_url2, file_path2)

            carousel_item["number"] = self.curr_num_of_right
            self.curr_num_of_right = self.curr_num_of_right + 1
            carousel_item["title"] = title2
            carousel_item["url"] = url2
            # carousel_item["img_url"] = img_url
            carousel_item["img_url"] = "../static/img/csdn/%s" % (file_name2)
            carousel_item["item_class"] = "Right"
            yield carousel_item

        for post_node3 in post_nodes3:
            sel3 = Selector(
                text=str(post_node3),
                type="html",
            )
            index_news_item["close_target_id"] = "myModal_%s" % (
                self.curr_num_of_article)
            index_news_item["close_target_id_ref"] = "#myModal_%s" % (
                self.curr_num_of_article)
            title3 = sel3.xpath(
                '//div[@class="title"]/h2/a/text()').extract()[0].strip()
            url3 = sel3.xpath(
                '//div[@class="title"]/h2/a//@href').extract()[0].strip()
            news_summary = sel3.xpath(
                '//div[@class="summary oneline"]/text()').extract()[0].strip()
            user_url = sel3.xpath('//dt/a//@href').extract()[0].strip()
            user_img_url = sel3.xpath('//dt/a/img//@src').extract()[0].strip()
            user_name = sel3.xpath(
                '//dd[@class="name"]/a/text()').extract()[0].strip()
            news_date = sel3.xpath(
                '//dd[@class="time"]/text()').extract()[0].strip()
            label_url = ''
            news_label = ''
            news_reads = '0'
            news_comments = '0'
            label_list = sel3.xpath('//dd[@class="tag"]/a//@href')
            if len(label_list) > 0:
                label_url = sel3.xpath(
                    '//dd[@class="tag"]/a//@href').extract()[0].strip()

            label_list2 = sel3.xpath('//dd[@class="tag"]/a/text()')
            if len(label_list2) > 0:
                news_label = sel3.xpath(
                    '//dd[@class="tag"]/a/text()').extract()[0].strip()

            reads_num_list = sel3.xpath(
                '//dd[@class="read_num"]/a/span[@class="num"]/text()')
            if len(reads_num_list) > 0:
                news_reads = sel3.xpath(
                    '//dd[@class="read_num"]/a/span[@class="num"]/text()'
                ).extract()[0].strip()
            comment_url = sel3.xpath(
                '//dd[@class="common_num "]/a//@href').extract()[0].strip()
            comment_num_list = sel3.xpath(
                '//dd[@class="common_num "]/a/span[@class="num"]/text()')
            if len(comment_num_list) > 0:
                news_comments = sel3.xpath(
                    '//dd[@class="common_num "]/a/span[@class="num"]/text()'
                ).extract()[0].strip()

            file_name3 = "userimg_%s.jpg" % (self.curr_num_of_article)
            index_news_item["number"] = self.curr_num_of_article
            self.curr_num_of_article = self.curr_num_of_article + 1
            file_path3 = os.path.join(
                "D:\StefanClub\StefanClub\www\static\img\csdn", file_name3)
            urllib.request.urlretrieve(user_img_url, file_path3)

            index_news_item["title"] = title3
            index_news_item["url"] = url3
            index_news_item["news_summary"] = news_summary
            index_news_item["user_img_url"] = "../static/img/csdn/%s" % (
                file_name3)
            index_news_item["user_name"] = user_name
            index_news_item["user_url"] = user_url
            index_news_item["news_date"] = news_date
            index_news_item["label_url"] = label_url
            index_news_item["news_label"] = news_label
            index_news_item["news_reads"] = int(news_reads)
            index_news_item["comment_url"] = comment_url
            index_news_item["news_comments"] = int(news_comments)

            yield index_news_item

    def parse_more_index_art(self, response):
        resultjson = json.loads(response.body)
        articles = resultjson['articles']
        index_news_item = IndexNews()
        for article in articles:
            title = article['title']
            url = article['url']
            news_summary = article['summary']
            user_img_url = article['avatar']
            close_target_id = "myModal_%s" % (self.curr_num_of_article)
            close_target_id_ref = "#myModal_%s" % (self.curr_num_of_article)
            file_name = "userimg_%s.jpg" % (self.curr_num_of_article)
            index_news_item["number"] = self.curr_num_of_article
            self.curr_num_of_article = self.curr_num_of_article + 1
            file_path = os.path.join(
                "D:\StefanClub\StefanClub\www\static\img\csdn", file_name)
            urllib.request.urlretrieve(user_img_url, file_path)
            user_name = article['user_name']
            user_url = article['user_url']
            news_date = article['created_at']
            news_label = article['category']
            label_url = "/nav/%s" % (article['category_id'])
            news_reads = article['views']
            news_comments = article['comments']
            comment_url = "%s#comment_form" % (article['url'])

            index_news_item["close_target_id"] = close_target_id
            index_news_item["close_target_id_ref"] = close_target_id_ref
            index_news_item["title"] = title
            index_news_item["url"] = url
            index_news_item["news_summary"] = news_summary
            index_news_item["user_img_url"] = "../static/img/csdn/%s" % (
                file_name)
            index_news_item["user_name"] = user_name
            index_news_item["user_url"] = user_url
            index_news_item["news_date"] = news_date
            index_news_item["label_url"] = label_url
            index_news_item["news_label"] = news_label
            index_news_item["news_reads"] = news_reads
            index_news_item["comment_url"] = comment_url
            index_news_item["news_comments"] = news_comments

            yield index_news_item
Example #6
def parse_main(response):
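    """Parse the Zhihu recommend feed HTML: for every story extract the cover
    image, title, vote and comment counts, and, once the item passes the
    duplicate/redis/MySQL pipeline checks, fetch and parse its comments and
    its full content."""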
    max_userimg_num = get_max_num('zhihuhotuser')
    if max_userimg_num is None:
        max_userimg_num = 0
    curr_num_of_usim = max_userimg_num + 1

    max_newsimg_num = get_max_num('zhihuhotnews')
    if max_newsimg_num is None:
        max_newsimg_num = 0
    curr_num_of_neim = max_newsimg_num + 1
    zhihuhot = ZhihuHot()
    soup = BeautifulSoup(response, 'lxml')
    post_nodes = soup.select("div[class='Card TopstoryItem TopstoryItem-isRecommend']")
    for post_node in post_nodes:
        sel = Selector(text=str(post_node), type="html", )
        '''feedsourceurl = sel.xpath('//a[@class="TopicLink"]//@href').extract()[0].strip()
        feedsourcetags = sel.xpath('//div[@aria-haspopup="true"]/text()')
        if len(feedsourcetags) > 0:
            feedsourcetag = sel.xpath('//div[@aria-haspopup="true"]/text()').extract()[0].strip()
        else:
            feedsourcetag =  None
        userimgsrcurl = sel.xpath('//img[@class="Avatar AuthorInfo-avatar"]//@src').extract()[0].strip()
        userimgurls = sel.xpath('//a[@class="UserLink-link"]//@href')
        if len(userimgurls) > 0:
            userimgurl = sel.xpath('//a[@class="UserLink-link"]//@href').extract()[0].strip()
        else:
            userimgurl = None
        usernames1 = sel.xpath('//a[@class="UserLink-link"]/text()')
        usernames2 = sel.xpath('//span[@class="UserLink AuthorInfo-name"]/text()')
        if len(usernames1) > 0:
            username = sel.xpath('//a[@class="UserLink-link"]/text()').extract()[0].strip()
        elif len(usernames2) > 0:
            username = sel.xpath('//span[@class="UserLink AuthorInfo-name"]/text()').extract()[0].strip()
        else:
            username = None
        userinfolist = sel.xpath('//div[@class="AuthorInfo-detail"]/div/div/text()')
        if len(userinfolist) > 0:
            userinfo = sel.xpath('//div[@class="AuthorInfo-detail"]/div/div/text()').extract()[0].strip()
        else:
            userinfo = None'''    # zhihu has removed the feedsource and authorinfo

        newsimg = sel.xpath('//div[@class="RichContent-cover-inner"]/img//@src')
        newsimg2 = sel.xpath('//div[@class="RichContent-cover-inner"]/div//@data-src')
        if len(newsimg) > 0:
            newsimgsrcurl = sel.xpath('//div[@class="RichContent-cover-inner"]/img//@src').extract()[0].strip()
        elif len(newsimg2) > 0:
            newsimgsrcurl = sel.xpath('//div[@class="RichContent-cover-inner"]/div//@data-src').extract()[0].strip()
        else:
            newsimgsrcurl = None
        if newsimgsrcurl is None:
            zhihuhot["newsimgsrcurl"] = None
            zhihuhot["newsimgnumber"] = None
        else:
            file_name1 = "zhihuhotnews_%s.jpg" % curr_num_of_neim
            file_path1 = os.path.join("D:\StefanClub\StefanClub\www\static\img\zhihu", file_name1)
            urllib.request.urlretrieve(newsimgsrcurl, file_path1)
            zhihuhot["newsimgsrcurl"] = "../static/img/zhihu/%s" % file_name1
            zhihuhot["newsimgnumber"] = curr_num_of_neim
            curr_num_of_neim = curr_num_of_neim + 1

        hasvideo = sel.xpath('//div[@class="RichContent-cover-play"]')
        if len(hasvideo) > 0:
            isvideo = 'TRUE'
        else:
            isvideo = 'FALSE'
        title1 = sel.xpath('//h2[@class="ContentItem-title"]/div/a')
        title2 = sel.xpath('//h2[@class="ContentItem-title"]/a')
        if len(title1) > 0:
            title = sel.xpath('//h2[@class="ContentItem-title"]/div/a/text()').extract()[0].strip()
            titleurl = sel.xpath('//h2[@class="ContentItem-title"]/div/a//@href').extract()[0].strip()
        elif len(title2) > 0:
            title = sel.xpath('//h2[@class="ContentItem-title"]/a/text()').extract()[0].strip()
            titleurl = sel.xpath('//h2[@class="ContentItem-title"]/a//@href').extract()[0].strip()
        else:
            title = 'Empty title, it will be dropped by redis control except the first one'
            titleurl = None
        hottype = None
        if titleurl is not None:
            if titleurl[1:9] == 'question':
                titleurl = '//www.zhihu.com' + titleurl
                hottype = 'question'
            if titleurl[2:10] == 'zhuanlan':
                hottype = 'zhuanlan'
        hotid = None
        if titleurl is not None:
            hotid = get_zhihu_hotid(titleurl)
        newscontent = sel.xpath('//span[@class="RichText ztext CopyrightRichText-richText"]/text()').extract()[0].strip()
        infavorqty1 = sel.xpath('//button[@class="Button VoteButton VoteButton--up"]/text()').extract()[0].strip()
        infavorqty2 = ''
        infavorqty2list = sel.xpath('//button[@class="Button VoteButton VoteButton--up"]/text()')
        if len(infavorqty2list) > 1:
             infavorqty2 = sel.xpath('//button[@class="Button VoteButton VoteButton--up"]/text()').extract()[1].strip()
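        # the vote button text may come back in two fragments; join them and
        # re-insert a space after the first two characters (likely the '赞同' label)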
        infavorqty = infavorqty1 + infavorqty2
        infavorqty_list = list(infavorqty)
        infavorqty_list.insert(2, " ")
        infavorqty = "".join(infavorqty_list)
        comment_title = sel.xpath('//button[@class="Button ContentItem-action Button--plain Button--withIcon Button--withLabel"]/text()').extract()[0].strip()
        comment_qty = get_comment_qty(comment_title)
        comment_page = comment_qty // 20 + (1 if comment_qty % 20 > 0 else 0)
        '''file_name = "zhihuhotuser_%s.jpg" % curr_num_of_usim
        file_path = os.path.join("D:\StefanClub\StefanClub\www\static\img\zhihu", file_name)
        urllib.request.urlretrieve(userimgsrcurl, file_path)
        zhihuhot["userimgsrcurl"] = "../static/img/zhihu/%s" % file_name
        zhihuhot["userimgnumber"] = curr_num_of_usim
        curr_num_of_usim = curr_num_of_usim + 1'''  # zhihu has removed the feedsource and authorinfo
        zhihuhot["userimgsrcurl"] = None# zhihu has removed the feedsource and authorinfo
        zhihuhot["userimgnumber"] = None# zhihu has removed the feedsource and authorinfo
        zhihuhot["feedsourcetag"] = None # feedsourcetag
        zhihuhot["feedsourceurl"] = None #feedsourceurl
        zhihuhot["userimgurl"] = None #userimgurl
        zhihuhot["username"] = None #username
        zhihuhot["userinfo"] = None #userinfo
        zhihuhot["newsimgurl"] = None
        zhihuhot["isvideo"] = isvideo
        zhihuhot["title"] = title
        zhihuhot["titleurl"] = titleurl
        zhihuhot["hotid"] = hotid
        zhihuhot["newscontent"] = newscontent
        zhihuhot["infavorqty"] = infavorqty
        zhihuhot["comment_url"] = None
        zhihuhot["comment_title"] = comment_title
        zhihuhot["share_url"] = None

        if hotid is not None:
            if duplicate_record.process_item(zhihuhot) is not None:
                if redis_deduplicate.process_item(zhihuhot) is not None:
                    if inserttomysql.process_item(zhihuhot) == 'toinsert':
                        if hottype == 'question':
                            for i in range(comment_page):
                                # html = s.get(answer_comment_url.format(hotid=hotid, offset=i * 20), headers=jsonheaders).text
                                html = s.get(answer_comment_url.format(hotid=hotid, offset=i * 20), headers=jsonheaders)
                                #result = json.dumps(html.json(), ensure_ascii=False)
                                result = html.json()
                                # parse_zhihuhot_comment(html,hotid)
                                parse_zhihuhot_comment(result, hotid)
                        elif hottype == 'zhuanlan':
                            for i in range(comment_page):
                                html = s.get(zhuanlan_comment_url.format(hotid=hotid, offset=i * 20), headers=jsonheaders)
                                result = html.json()
                                parse_zhihuhot_comment(result, hotid)
                        contenturl = 'https:' + titleurl
                        mainhtml = s.get(contenturl).text
                        parse_zhihuhot_content(mainhtml, hotid, hottype)
Example #7
class ZhihuSpider(scrapy.Spider):
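    """Log into Zhihu with a Selenium-driven Chrome session, copy the browser
    cookies into a requests session and into the Scrapy requests, then crawl
    the recommend feed, the comment API pages and the full answer/article content."""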
    name = 'zhihu'
    allowed_domains = ['www.zhihu.com']
    start_urls = ['https://www.zhihu.com']

    zhuanlan_comment_url = 'https://www.zhihu.com/api/v4/articles/{hotid}/comments?include=data%5B*%5D.author%2Ccollapsed%2Creply_to_author%2Cdisliked%2Ccontent%2Cvoting%2Cvote_count%2Cis_parent_author%2Cis_author%2Calgorithm_right&order=normal&limit=20&offset={offset}&status=open'
    answer_comment_url = 'https://www.zhihu.com/api/v4/answers/{hotid}/comments?include=data%5B*%5D.author%2Ccollapsed%2Creply_to_author%2Cdisliked%2Ccontent%2Cvoting%2Cvote_count%2Cis_parent_author%2Cis_author%2Calgorithm_right&order=normal&limit=20&offset={offset}&status=open'
    # headers = {'User-Agent':"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",}
    headers = {}
    max_userimg_num = get_max_num('zhihuhotuser')
    if max_userimg_num is None:
        max_userimg_num = 0
    curr_num_of_usim = max_userimg_num + 1

    max_newsimg_num = get_max_num('zhihuhotnews')
    if max_newsimg_num is None:
        max_newsimg_num = 0
    curr_num_of_neim = max_newsimg_num + 1

    max_comuserimg_num = get_max_num('zhihuhotcomments')
    if max_comuserimg_num is None:
        max_comuserimg_num = 0
    curr_num_of_comuser = max_comuserimg_num + 1

    def start_requests(self):
        #display = Display(visible=0, size=(800, 600))
        #display.start()
        s = requests.Session()
        s.headers.clear()
        browser = webdriver.Chrome()
        '''browser2 = webdriver.Firefox()
        browser2.get("https://www.zhihu.com/signin")
        browser2.find_element_by_css_selector(".SignFlow-accountInput.Input-wrapper input").send_keys("13818248346")
        browser2.find_element_by_css_selector(".SignFlow-password input").send_keys("kaihua1010")
        browser2.find_element_by_css_selector(".Button.SignFlow-submitButton").click()
        time.sleep(5)
        Cookies2 = browser2.get_cookies()'''

        browser.get("https://www.zhihu.com/signin")
        browser.find_element_by_css_selector(
            ".SignFlow-accountInput.Input-wrapper input").send_keys(
                "13818248346")
        browser.find_element_by_css_selector(
            ".SignFlow-password input").send_keys("kaihua1010")
        browser.find_element_by_css_selector(
            ".Button.SignFlow-submitButton").click()
        time.sleep(10)
        Cookies = browser.get_cookies()
        cookie_dict = {}
        '''cookie = [item["name"] + "=" + item["value"] for item in Cookies]
        cookiestr = '; '.join(item for item in cookie)
        self.headers['cookie'] = cookiestr'''
        for cookie in Cookies:
            cookie_dict[cookie['name']] = cookie['value']
            s.cookies.set(cookie['name'], cookie['value'])

        browser.close()
        html2 = s.get(
            'https://www.zhihu.com/question/263892920/answer/405697336').text
        print(html2)
        html3 = s.get(self.start_urls[0]).text
        print(html3)
        self.headers = s.headers
        #display.stop()
        '''cookie_dict = {}
        cookie_dict['_xsrf'] = 'a1807a2e-e8da-4464-bcc0-b11be259f42b'
        cookie_dict['_zap'] = '1e5ccdb9-7860-466f-9a59-ff9cb19e072d'
        cookie_dict['d_c0'] = 'ALAnFykIXg6PTtXBbHxOihH1-UmQKy8guOQ=|1539608541'
        cookie_dict['capsion_ticket'] = '2|1:0|10:1539790364|14:capsion_ticket|44:NWIxZTRlYjUxOTg3NGI5MjgwODBhZjYwNmEwNTFhYTI=|4c1810d7fdf17461da2dc94a82756002cab3899386af9fef230ec3efd8f410f9'
        cookie_dict['z_c0'] = '2|1:0|10:1539790372|4:z_c0|92:Mi4xV2ZvX0F3QUFBQUFBc0NjWEtRaGVEaVlBQUFCZ0FsVk5KS1MwWEFEXy0wZTFkX1I3SjhwTlRTSnUxSDRQajhfcHdR|a100b4ed646d6454be56a0be8f3257fddb1ead83eabe52056221759123a6d005'
        cookie_dict['q_c1'] = '1d154dfb0c3c49b5afcdf6c78fc70148|1539608544000|1539608544000'
        cookie_dict['tgw_l7_route'] = '170010e948f1b2a2d4c7f3737c85e98c'
        cookie_dict['__utma'] = '51854390.302433648.1539609652.1539609652.1539609652.1'
        cookie_dict['__utmc'] = '51854390'
        cookie_dict['__utmz'] = '51854390.1539609652.1.1.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/hot'
        cookie_dict['__utmv'] = '51854390.100--|2=registration_date=20160715=1^3=entry_date=20160715=1'
        cookie_dict['tst'] = 'r'
        cookie_dict['__gads'] = 'ID=fa1c97a341ad2943:T=1539696957:S=ALNI_MZJ7ws-b5ObURSAQlBAGi8pbTmD6g'
'''
        yield Request(
            url=self.start_urls[0],
            dont_filter=True,
            meta={"cookies": cookie_dict},
            cookies=cookie_dict,
            callback=self.parse_main,
        )
        #yield Request(url=self.start_urls[0], dont_filter=True, meta={"cookies": cookie_dict}, cookies=cookie_dict,callback=self.parse_main, )
        #yield Request(url=self.start_urls[0], dont_filter=True, headers=self.headers, callback=self.parse_main)

    def parse_main(self, response):
        zhihuhot = ZhihuHot()
        cookie_dict = response.meta.get("cookies", "")
        soup = BeautifulSoup(response.text, 'lxml')
        post_nodes = soup.select("div[class='Card TopstoryItem']")
        for post_node in post_nodes:
            sel = Selector(
                text=str(post_node),
                type="html",
            )
            feedsourceurl = sel.xpath(
                '//a[@class="TopicLink"]//@href').extract()[0].strip()
            feedsourcetags = sel.xpath('//div[@aria-haspopup="true"]/text()')
            if len(feedsourcetags) > 0:
                feedsourcetag = sel.xpath('//div[@aria-haspopup="true"]/text()'
                                          ).extract()[0].strip()
            else:
                feedsourcetag = None
            userimgsrcurl = sel.xpath(
                '//img[@class="Avatar AuthorInfo-avatar"]//@src').extract(
                )[0].strip()
            userimgurls = sel.xpath('//a[@class="UserLink-link"]//@href')
            if len(userimgurls) > 0:
                userimgurl = sel.xpath(
                    '//a[@class="UserLink-link"]//@href').extract()[0].strip()
            else:
                userimgurl = None
            usernames1 = sel.xpath('//a[@class="UserLink-link"]/text()')
            usernames2 = sel.xpath(
                '//span[@class="UserLink AuthorInfo-name"]/text()')
            if len(usernames1) > 0:
                username = sel.xpath(
                    '//a[@class="UserLink-link"]/text()').extract()[0].strip()
            elif len(usernames2) > 0:
                username = sel.xpath(
                    '//span[@class="UserLink AuthorInfo-name"]/text()'
                ).extract()[0].strip()
            else:
                username = None
            userinfolist = sel.xpath(
                '//div[@class="AuthorInfo-detail"]/div/div/text()')
            if len(userinfolist) > 0:
                userinfo = sel.xpath(
                    '//div[@class="AuthorInfo-detail"]/div/div/text()'
                ).extract()[0].strip()
            else:
                userinfo = None

            newsimg = sel.xpath(
                '//div[@class="RichContent-cover-inner"]/img//@src')
            newsimg2 = sel.xpath(
                '//div[@class="RichContent-cover-inner"]/div//@data-src')
            if len(newsimg) > 0:
                newsimgsrcurl = sel.xpath(
                    '//div[@class="RichContent-cover-inner"]/img//@src'
                ).extract()[0].strip()
            elif len(newsimg2) > 0:
                newsimgsrcurl = sel.xpath(
                    '//div[@class="RichContent-cover-inner"]/div//@data-src'
                ).extract()[0].strip()
            else:
                newsimgsrcurl = None
            if newsimgsrcurl is None:
                zhihuhot["newsimgsrcurl"] = None
                zhihuhot["newsimgnumber"] = None
            else:
                file_name1 = "zhihuhotnews_%s.jpg" % self.curr_num_of_neim
                file_path1 = os.path.join(
                    "D:\StefanClub\StefanClub\www\static\img\zhihu",
                    file_name1)
                urllib.request.urlretrieve(newsimgsrcurl, file_path1)
                zhihuhot[
                    "newsimgsrcurl"] = "../static/img/zhihu/%s" % file_name1
                zhihuhot["newsimgnumber"] = self.curr_num_of_neim
                self.curr_num_of_neim = self.curr_num_of_neim + 1

            hasvideo = sel.xpath('//div[@class="RichContent-cover-play"]')
            if len(hasvideo) > 0:
                isvideo = 'TRUE'
            else:
                isvideo = 'FALSE'
            title1 = sel.xpath('//h2[@class="ContentItem-title"]/div/a')
            title2 = sel.xpath('//h2[@class="ContentItem-title"]/a')
            if len(title1) > 0:
                title = sel.xpath(
                    '//h2[@class="ContentItem-title"]/div/a/text()').extract(
                    )[0].strip()
                titleurl = sel.xpath(
                    '//h2[@class="ContentItem-title"]/div/a//@href').extract(
                    )[0].strip()
            elif len(title2) > 0:
                title = sel.xpath('//h2[@class="ContentItem-title"]/a/text()'
                                  ).extract()[0].strip()
                titleurl = sel.xpath(
                    '//h2[@class="ContentItem-title"]/a//@href').extract(
                    )[0].strip()
            else:
                title = 'Empty title, it will be dropped by redis control except the first one'
                titleurl = None
            hottype = None
            if titleurl is not None:
                if titleurl[1:9] == 'question':
                    titleurl = '//www.zhihu.com' + titleurl
                    hottype = 'question'
                if titleurl[2:10] == 'zhuanlan':
                    hottype = 'zhuanlan'
            hotid = None
            if titleurl is not None:
                hotid = get_zhihu_hotid(titleurl)
            newscontent = sel.xpath(
                '//span[@class="RichText ztext CopyrightRichText-richText"]/text()'
            ).extract()[0].strip()
            infavorqty1 = sel.xpath(
                '//button[@class="Button VoteButton VoteButton--up"]/text()'
            ).extract()[0].strip()
            infavorqty2 = sel.xpath(
                '//button[@class="Button VoteButton VoteButton--up"]/text()'
            ).extract()[1].strip()
            infavorqty = infavorqty1 + infavorqty2
            infavorqty_list = list(infavorqty)
            infavorqty_list.insert(2, " ")
            infavorqty = "".join(infavorqty_list)
            comment_title = sel.xpath(
                '//button[@class="Button ContentItem-action Button--plain Button--withIcon Button--withLabel"]/text()'
            ).extract()[0].strip()
            comment_qty = get_comment_qty(comment_title)
            comment_page = comment_qty // 20 + (1
                                                if comment_qty % 20 > 0 else 0)
            file_name = "zhihuhotuser_%s.jpg" % self.curr_num_of_usim
            file_path = os.path.join(
                "D:\StefanClub\StefanClub\www\static\img\zhihu", file_name)
            urllib.request.urlretrieve(userimgsrcurl, file_path)
            zhihuhot["userimgsrcurl"] = "../static/img/zhihu/%s" % file_name
            zhihuhot["userimgnumber"] = self.curr_num_of_usim
            self.curr_num_of_usim = self.curr_num_of_usim + 1

            zhihuhot["feedsourcetag"] = feedsourcetag
            zhihuhot["feedsourceurl"] = feedsourceurl
            zhihuhot["userimgurl"] = userimgurl
            zhihuhot["username"] = username
            zhihuhot["userinfo"] = userinfo
            zhihuhot["newsimgurl"] = None
            zhihuhot["isvideo"] = isvideo
            zhihuhot["title"] = title
            zhihuhot["titleurl"] = titleurl
            zhihuhot["hotid"] = hotid
            zhihuhot["newscontent"] = newscontent
            zhihuhot["infavorqty"] = infavorqty
            zhihuhot["comment_url"] = None
            zhihuhot["comment_title"] = comment_title
            zhihuhot["share_url"] = None

            if hotid is not None:
                yield zhihuhot
                if hottype == 'question':
                    for i in range(comment_page):
                        yield Request(url=self.answer_comment_url.format(
                            hotid=hotid, offset=i * 20),
                                      meta={"hotid": hotid},
                                      cookies=cookie_dict,
                                      callback=self.parse_zhihuhot_comment)
                elif hottype == 'zhuanlan':
                    for i in range(comment_page):
                        yield Request(url=self.zhuanlan_comment_url.format(
                            hotid=hotid, offset=i * 20),
                                      meta={"hotid": hotid},
                                      cookies=cookie_dict,
                                      callback=self.parse_zhihuhot_comment)
                yield Request(url=titleurl,
                              meta={
                                  "hotid": hotid,
                                  "hottype": hottype
                              },
                              cookies=cookie_dict,
                              callback=self.parse_zhihuhot_content)

    def parse_zhihuhot_comment(self, response):
        hotid = response.meta.get("hotid", "")
        resultjson = json.loads(response.body)
        comments = resultjson['data']
        comment_item = ZhihuHotComment()
        for comment in comments:
            commentid = comment['id']
            author = comment['author']
            author_member = author['member']
            userimgsrcurl = author_member['avatar_url']
            url_token = author_member['url_token']
            userimgurl = '//www.zhihu.com/people/' + url_token
            username = author_member['name']
            replytime = comment['created_time']
            replytime = datetime.datetime.fromtimestamp(replytime)
            content = comment['content']
            infavorqty = comment['vote_count']
            replytouser = None
            replytouserurl = None
            if "reply_to_author" in comment.keys():
                reply_to_author = comment['reply_to_author']
                if reply_to_author is not None:
                    reply_to_author_member = reply_to_author['member']
                    replytouser = reply_to_author_member['name']
                    replytouser_urltoken = reply_to_author_member['url_token']
                    replytouserurl = '//www.zhihu.com/people/' + replytouser_urltoken
            file_name = "zhihuhotcomuser_%s.jpg" % self.curr_num_of_comuser
            file_path = os.path.join(
                "D:\StefanClub\StefanClub\www\static\img\zhihu", file_name)
            urllib.request.urlretrieve(userimgsrcurl, file_path)
            comment_item[
                "userimgsrcurl"] = "../static/img/zhihu/%s" % file_name
            comment_item["userimgnumber"] = self.curr_num_of_comuser
            self.curr_num_of_comuser = self.curr_num_of_comuser + 1

            comment_item["commentid"] = commentid
            comment_item["hotid"] = hotid
            comment_item["userimgurl"] = userimgurl
            comment_item["username"] = username
            comment_item["replytouser"] = replytouser
            comment_item["replytouserurl"] = replytouserurl
            comment_item["replytime"] = replytime
            comment_item["content"] = content
            comment_item["infavorqty"] = infavorqty

            yield comment_item

    def parse_zhihuhot_content(self, response):
        hotid = response.meta.get("hotid", "")
        partno = 1
        hottype = response.meta.get("hottype", "")
        zhihuhot_content = ZhihuHotContent()
        soup = BeautifulSoup(response.text, 'lxml')
        if hottype == 'question':
            post_node = soup.select("span[class='CopyrightRichText-richText']")
        else:
            post_node = soup.select("div[class='Post-RichText']")
        sel = Selector(
            text=str(post_node),
            type="html",
        )
        test = sel.xpath('*')
Example #8
class TaobaoSpider(scrapy.Spider):
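    """For each keyword, run a Taobao search through a Selenium-driven Chrome
    session, parse the inline g_page_config results, then page through the
    JSONP search API and yield a taobaoproduct item per auction."""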
    name = 'taobao'
    allowed_domains = ['www.taobao.com']
    start_urls = ['http://www.taobao.com/']
    keywords = ['UNIQLO', 'SUPERME', 'NIKE', 'ADIDAS', 'APPLE', 'HUAWEI']
    #keywords = ['UNIQLO', 'SUPERME', 'NIKE', 'ADIDAS']
    last_twelve_url = 'https://s.taobao.com/api?_ksTS=1537784279315_208&callback=jsonp209&ajax=true&m=customized&stats_click=search_radio_all:1&q={keyword}&s=36&imgfile=&initiative_id=staobaoz_20180924&bcoffset=0&js=1&ie=utf8&rn=91a38a1dc028b177e8b2f5d17a1f1e05'
    next_page_url = 'https://s.taobao.com/search?data-key=s&data-value={datavalue}&ajax=true&_ksTS=1537791664734_887&callback=jsonp888&q={keyword}&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_20180920&ie=utf8&bcoffset=4&p4ppushleft=1%2C48'

    max_img_num = get_max_num('taobaoproduct')
    if max_img_num is None:
        max_img_num = 0
    curr_num_of_img = max_img_num + 1

    def start_requests(self):
        yield scrapy.Request('http://www.taobao.com/', callback=self.parse)

    def parse(self, response):
        for keyword in self.keywords:
            browser = webdriver.Chrome()
            browser.get('https://www.taobao.com/')
            browser.find_element_by_class_name(
                "search-combobox-input").send_keys(keyword)
            browser.find_element_by_class_name("btn-search").click()
            time.sleep(5)
            while browser.page_source.find('g_page_config') == -1:
                browser.refresh()
            page_source = browser.page_source
            browser.close()
            #yield self.parse_first_batch(page_source,keyword)
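            # the first page of results is embedded in the page source as a
            # `g_page_config = {...};` JS assignment; slice out the JSON object
            # (the [:-1] below drops the trailing semicolon)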
            g_page_config = page_source[
                page_source.index('g_page_config = {') +
                15:page_source.index('g_srp_loadCss()')].strip()
            g_page_config_json = json.loads(g_page_config[:-1])
            modsfirst = g_page_config_json["mods"]
            itemlist_first = modsfirst["itemlist"]
            data_first = itemlist_first["data"]
            auctions_first = data_first["auctions"]
            for auction_first in auctions_first:
                taobaoproduct = parse_taobao_products(auction_first, keyword,
                                                      self.curr_num_of_img)
                self.curr_num_of_img = self.curr_num_of_img + 1
                yield taobaoproduct

            last_twelve_response = requests.get(
                self.last_twelve_url.format(keyword=keyword))
            while last_twelve_response.text == '':
                last_twelve_response = requests.get(
                    self.last_twelve_url.format(keyword=keyword))
            #yield self.parse_last_twelve(last_twelve_response.text,keyword)
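            # the API answers with a JSONP wrapper; drop the callback name and
            # the closing characters so the remainder parses as plain JSON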
            dict_response_last = last_twelve_response.text[11:]
            dict_response_last = dict_response_last[:-2]
            json_response_last = json.loads(dict_response_last)
            customizedapi = json_response_last['API.CustomizedApi']
            itemlist_last = customizedapi['itemlist']
            auctions_last = itemlist_last['auctions']
            for auction_last in auctions_last:
                taobaoproduct = parse_taobao_products(auction_last, keyword,
                                                      self.curr_num_of_img)
                self.curr_num_of_img = self.curr_num_of_img + 1
                yield taobaoproduct

            # get next page by click the nextlink
            # browser.find_elements_by_partial_link_text('下一页')[0].click()
            # next_page_source = browser.page_source
            for i in range(30):
                nextpage_response = requests.get(
                    self.next_page_url.format(datavalue=44 * (i + 1),
                                              keyword=keyword))
                while nextpage_response.text == '':
                    nextpage_response = requests.get(
                        self.next_page_url.format(datavalue=44 * (i + 1),
                                                  keyword=keyword))
                #yield self.parse_next_page(nextpage_response.text,keyword)
                dict_response_next = nextpage_response.text[11:]
                dict_response_next = dict_response_next[:-2]
                json_response_next = json.loads(dict_response_next)
                modsnext = json_response_next['mods']
                itemlist_next = modsnext['itemlist']
                data_next = itemlist_next['data']
                auctions_next = data_next['auctions']
                for auction_next in auctions_next:
                    taobaoproduct = parse_taobao_products(
                        auction_next, keyword, self.curr_num_of_img)
                    self.curr_num_of_img = self.curr_num_of_img + 1
                    yield taobaoproduct

    def parse_first_batch(self, page_source, keyword):
        g_page_config = page_source[page_source.index('g_page_config = {') +
                                    15:page_source.index('g_srp_loadCss()'
                                                         )].strip()
        g_page_config_json = json.loads(g_page_config[:-1])
        mods = g_page_config_json["mods"]
        itemlist = mods["itemlist"]
        data = itemlist["data"]
        auctions = data["auctions"]
        for auction in auctions:
            taobaoproduct = parse_taobao_products(auction, keyword,
                                                  self.curr_num_of_img)
            self.curr_num_of_img = self.curr_num_of_img + 1
            yield taobaoproduct

    def parse_last_twelve(self, response, keyword):
        dict_response = response[11:]
        dict_response = dict_response[:-2]
        json_response = json.loads(dict_response)
        customizedapi = json_response['API.CustomizedApi']
        itemlist = customizedapi['itemlist']
        auctions = itemlist['auctions']
        for auction in auctions:
            taobaoproduct = parse_taobao_products(auction, keyword,
                                                  self.curr_num_of_img)
            self.curr_num_of_img = self.curr_num_of_img + 1
            yield taobaoproduct

    def parse_next_page(self, response, keyword):
        dict_response = response[11:]
        dict_response = dict_response[:-2]
        json_response = json.loads(dict_response)
        mods = json_response['mods']
        itemlist = mods['itemlist']
        data = itemlist['data']
        auctions = data['auctions']
        for auction in auctions:
            taobaoproduct = parse_taobao_products(auction, keyword,
                                                  self.curr_num_of_img)
            self.curr_num_of_img = self.curr_num_of_img + 1
            yield taobaoproduct
Example #9
class SinasportsSpider(scrapy.Spider):
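    """Crawl the Sina Sports front page: the live-match JS feed, several
    tianyi JSONP news feeds, the carousel and the hot-match/left-column blocks,
    saving the referenced images into the local static folder."""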
    name = 'sinasports'
    allowed_domains = ['sports.sina.com.cn']
    start_urls = ['http://sports.sina.com.cn/']

    matcher_api_url = 'http://sports.sina.com.cn/iframe/js/2015/live.js?dpc=1'
    # tianyi JSONP feed endpoints; each query string is split across adjacent
    # string literals so the request URLs contain no embedded whitespace
    nbanews_url = ('http://cre.mix.sina.com.cn/get/cms/feed?callback=jQuery111304795819583663854_1535281251386'
                   '&pcProduct=31&ctime=&merge=3&mod=pcsptw&cre=tianyi&statics=1'
                   '&length=12&ad=%7B%22rotate_count%22%3A100%2C%22platform%22%3A%22pc%22%2C%22channel%22%3A%22'
                   'tianyi_pcspt%22%2C%22page_url%22%3A%22http%3A%2F%2Fsports.sina.com.cn%2F%22%2C%22'
                   'timestamp%22%3A1535281251422+%7D&_=1535281251395]')
    intsoc_url = ('http://cre.mix.sina.com.cn/get/cms/feed?callback=jQuery111306164420163923745_1535782658772'
                  '&pcProduct=30&ctime=&merge=3&mod=pcsptw&cre=tianyi&statics=1&length=12&ad=%7B%22rotate_count'
                  '%22%3A100%2C%22platform%22%3A%22pc%22%2C%22channel%22%3A%22tianyi_pcspt%22%2C%22page_url%22'
                  '%3A%22http%3A%2F%2Fsports.sina.com.cn%2F%22%2C%22timestamp%22%3A1535782658814+%7D&_=1535782658971')
    chisoc_url = ('http://cre.mix.sina.com.cn/get/cms/feed?callback=jQuery111306164420163923745_1535782658772'
                  '&pcProduct=29&ctime=&merge=3&mod=pcsptw&cre=tianyi&statics=1&length=12&ad=%7B%22rotate_count'
                  '%22%3A100%2C%22platform%22%3A%22pc%22%2C%22channel%22%3A%22tianyi_pcspt%22%2C%22page_url%22%3A'
                  '%22http%3A%2F%2Fsports.sina.com.cn%2F%22%2C%22timestamp%22%3A1535782658814+%7D&_=1535782659005')
    cba_url = ('http://cre.mix.sina.com.cn/get/cms/feed?callback=jQuery111306164420163923745_1535782658772&pcProduct=32'
               '&ctime=&merge=3&mod=pcsptw&cre=tianyi&statics=1&length=12&ad=%7B%22rotate_count%22%3A100%2C%22platform'
               '%22%3A%22pc%22%2C%22channel%22%3A%22tianyi_pcspt%22%2C%22page_url%22%3A%22http%3A%2F%2Fsports.sina.com.cn'
               '%2F%22%2C%22timestamp%22%3A1535782658814+%7D&_=1535782659010')
    sum_url = ('http://cre.mix.sina.com.cn/get/cms/feed?callback=jQuery111306164420163923745_1535782658772&pcProduct=33'
               '&ctime=&merge=3&mod=pcsptw&cre=tianyi&statics=1&length=12&ad=%7B%22rotate_count%22%3A100%2C%22platform'
               '%22%3A%22pc%22%2C%22channel%22%3A%22tianyi_pcspt%22%2C%22page_url%22%3A%22http%3A%2F%2Fsports.sina.com.cn'
               '%2F%22%2C%22timestamp%22%3A1535782658814+%7D&_=1535782659015')

    max_car_number = get_max_num('sinacar')
    if max_car_number is None:
        max_car_number = 0
    curr_num_of_car = max_car_number + 1

    max_hotmatnews_num = get_max_num('hotmatch_news')
    if max_hotmatnews_num is None:
        max_hotmatnews_num = 0
    curr_num_of_hmn = max_hotmatnews_num + 1

    max_nbanews_num = get_max_num('nbanews')
    if max_nbanews_num is None:
        max_nbanews_num = 0
    curr_num_of_nba = max_nbanews_num + 1

    max_lefttop_num = get_max_num('lefttop')
    if max_lefttop_num is None:
        max_lefttop_num = 0
    curr_num_of_letop = max_lefttop_num + 1

    max_leftsec_num = get_max_num('leftsec')
    if max_leftsec_num is None:
        max_leftsec_num = 0
    curr_num_of_lesec = max_leftsec_num + 1

    def start_requests(self):
        yield Request(self.nbanews_url,
                      meta={
                          "newstype": "NBA",
                          "request_url": self.nbanews_url
                      },
                      callback=self.parse_nba_news)
        yield Request(self.intsoc_url,
                      meta={
                          "newstype": "INTSOC",
                          "request_url": self.intsoc_url
                      },
                      callback=self.parse_nba_news)
        yield Request(self.chisoc_url,
                      meta={
                          "newstype": "CHISOC",
                          "request_url": self.chisoc_url
                      },
                      callback=self.parse_nba_news)
        yield Request(self.cba_url,
                      meta={
                          "newstype": "CBA",
                          "request_url": self.cba_url
                      },
                      callback=self.parse_nba_news)
        yield Request(self.sum_url,
                      meta={
                          "newstype": "SUM",
                          "request_url": self.sum_url
                      },
                      callback=self.parse_nba_news)
        for start_url in self.start_urls:
            yield Request(start_url, callback=self.parse_main)
        yield Request(self.matcher_api_url, callback=self.parse_matches)

    def parse_matches(self, response):
        hotmatches = HotMatches()
        html = response.text
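        # live.js returns a JS assignment rather than plain JSON; trim the
        # wrapper characters before and after the payload so it can be loaded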
        html = html[:-13]
        html = html[42:]
        resultjson = json.loads(html)
        matches = resultjson['matches']
        for match in matches:
            livecast_id = match['livecast_id']
            shorttitle = match['ShortTitle']
            round_cn = match['Round_cn']
            title = shorttitle + round_cn
            team1 = match['Team1']
            team2 = match['Team2']
            score1 = match['Score1']
            score2 = match['Score2']
            if not score1.strip() and not score2.strip():
                matchtype = 'pre'
            else:
                matchtype = 'post'
            matchdate = match['date']
            matchdate = matchdate[5:]
            matchtime = match['time']
            newsurl = match['NewsUrl']
            liveurl = match['LiveUrl']
            match_url = match['match_url']

            hotmatches['livecast_id'] = livecast_id
            hotmatches['type'] = matchtype
            hotmatches['title'] = title
            hotmatches['team1'] = team1
            hotmatches['team2'] = team2
            hotmatches['score1'] = score1
            hotmatches['score2'] = score2
            hotmatches['matchdate'] = matchdate
            hotmatches['matchtime'] = matchtime
            hotmatches['newsurl'] = newsurl
            hotmatches['liveurl'] = liveurl
            hotmatches['match_url'] = match_url

            if hotmatches['team1'] and hotmatches['team2']:
                yield hotmatches

    def parse_main(self, response):
        sinacarousel = SinaCarousel()
        hotmatchnews = HotMatchNews()
        soup = BeautifulSoup(response.text, 'lxml')
        post_nodes = soup.select(
            "ul[class='slide-focus-d-cont'] li[class='clearfix thumbnail-b-gra']"
        )
        post_nodes1 = soup.select("div[node-type='tytopwrap']")
        lefttopimg_node = soup.select(
            "div[data-sudaclick='blk_focusvideo'] div[class='thumbnail-b thumbnail-b-gra thumbnail-b-video']"
        )[0]
        post_nodes2 = soup.select(
            "div[data-sudaclick='blk_focusvideo'] div[class='layout-mt-g news-list-e'] p"
        )
        leftsecond_node = soup.select(
            "div[class='layout-mt-h layout-mb-e news-hot']")[0]

        lefttop_nbanews = parse_lefttop(lefttopimg_node)
        yield lefttop_nbanews
        lefttoplines_nbanewslist = parse_lefttoplines(post_nodes2)
        for i in range(0, len(lefttoplines_nbanewslist)):
            if lefttoplines_nbanewslist[i] is not None:
                yield lefttoplines_nbanewslist[i]
        leftsec_nbanews = parse_leftsecond(leftsecond_node)
        yield leftsec_nbanews
        leftsectxt_nbanewslist = parse_leftsectxt(leftsecond_node)
        for i in range(0, len(leftsectxt_nbanewslist)):
            yield leftsectxt_nbanewslist[i]

        for post_node in post_nodes:
            sel = Selector(
                text=str(post_node),
                type="html",
            )
            title = sel.xpath('//p/text()').extract()[0].strip()
            url = sel.xpath('//a//@href').extract()[0].strip()
            img_url = sel.xpath('//img//@src').extract()[0].strip()
            file_name = "carousel_%s.jpg" % self.curr_num_of_car
            file_path = os.path.join(
                "D:\StefanClub\StefanClub\www\static\img\sinasports",
                file_name)
            urllib.request.urlretrieve(img_url, file_path)

            sinacarousel["number"] = self.curr_num_of_car
            self.curr_num_of_car = self.curr_num_of_car + 1
            sinacarousel["title"] = title
            sinacarousel["url"] = url
            #carousel_item["img_url"] = img_url
            sinacarousel["img_url"] = "../static/img/sinasports/%s" % file_name
            yield sinacarousel

        for post_node1 in post_nodes1:
            # Fresh item per hot-match news block.
            hotmatchnews = HotMatchNews()
            sel1 = Selector(
                text=str(post_node1),
                type="html",
            )
            # Each block carries up to three headline links; pad the missing
            # slots with empty strings instead of branching on the count.
            titletexts = [t.strip() for t in sel1.xpath('//h3/a/text()').extract()]
            titlehrefs = [u.strip() for u in sel1.xpath('//h3/a//@href').extract()]
            title1, title2, title3 = (titletexts[:3] + [''] * 3)[:3]
            title1url, title2url, title3url = (titlehrefs[:3] + [''] * 3)[:3]
            imgurl = sel1.xpath('//div[@class="ty-card-thumb-w"]/a//@href'
                                ).extract()[0].strip()
            imgsrcurl = sel1.xpath('//img//@src').extract()[0].strip()
            imgsrcurl = 'http:' + imgsrcurl
            file_name1 = "hotmatchnews_%s.jpg" % self.curr_num_of_hmn
            file_path1 = os.path.join(
                "D:\StefanClub\StefanClub\www\static\img\sinasports",
                file_name1)
            urllib.request.urlretrieve(imgsrcurl, file_path1)

            # The block is expected to hold three <li> elements with up to three
            # links each; flatten them into line1..line9 / line1url..line9url,
            # padding missing slots with empty strings.
            linetexts = []
            linehrefs = []
            for li_html in sel1.xpath('//li').extract()[:3]:
                subsel = Selector(text=li_html, type="html")
                texts = [t.strip() for t in subsel.xpath('//a/text()').extract()]
                hrefs = [u.strip() for u in subsel.xpath('//a//@href').extract()]
                linetexts.extend((texts[:3] + [''] * 3)[:3])
                linehrefs.extend((hrefs[:3] + [''] * 3)[:3])
            linetexts = (linetexts + [''] * 9)[:9]
            linehrefs = (linehrefs + [''] * 9)[:9]
            (line1, line2, line3, line4, line5, line6, line7, line8,
             line9) = linetexts
            (line1url, line2url, line3url, line4url, line5url, line6url,
             line7url, line8url, line9url) = linehrefs

            hotmatchnews["number"] = self.curr_num_of_hmn
            self.curr_num_of_hmn = self.curr_num_of_hmn + 1
            hotmatchnews["title1"] = title1
            hotmatchnews["title2"] = title2
            hotmatchnews["title3"] = title3
            hotmatchnews["title1url"] = title1url
            hotmatchnews["title2url"] = title2url
            hotmatchnews["title3url"] = title3url
            hotmatchnews["imgsrcurl"] = "../static/img/sinasports/%s" % file_name1
            hotmatchnews["imgurl"] = imgurl
            hotmatchnews["line1"] = line1
            hotmatchnews["line2"] = line2
            hotmatchnews["line3"] = line3
            hotmatchnews["line1url"] = line1url
            hotmatchnews["line2url"] = line2url
            hotmatchnews["line3url"] = line3url
            hotmatchnews["line4"] = line4
            hotmatchnews["line5"] = line5
            hotmatchnews["line6"] = line6
            hotmatchnews["line4url"] = line4url
            hotmatchnews["line5url"] = line5url
            hotmatchnews["line6url"] = line6url
            hotmatchnews["line7"] = line7
            hotmatchnews["line8"] = line8
            hotmatchnews["line9"] = line9
            hotmatchnews["line7url"] = line7url
            hotmatchnews["line8url"] = line8url
            hotmatchnews["line9url"] = line9url

            yield hotmatchnews

    def parse_nba_news(self, response):
        newstype = response.meta.get("newstype", "")
        request_url = response.meta.get("request_url", "")
        html = response.text
        # Some of these endpoints wrap the JSON in a jQuery JSONP callback;
        # strip the callback prefix and the trailing ");" before parsing.
        if html[0:6] == 'jQuery':
            html = html[:-3]
            html = html[43:]
        prejson = json.loads(html)
        if "data" in prejson.keys():
            nbanewslist = prejson['data']
            for nbanewsitem in nbanewslist:
                # Fresh item per news entry.
                nbanews = NbaNews()
                imgsrcurl = nbanewsitem["thumb"]
                imgurl = ''
                isvideo = 'FALSE'
                file_name = ''
                nbanews["number"] = -1
                if imgsrcurl:
                    imgurl = nbanewsitem["url"]
                    if imgurl is None:
                        imgurl = nbanewsitem["url_https"]
                    #if nbanewsitem.has_key('video_id'):
                    if "video_id" in nbanewsitem.keys():
                        video_id = nbanewsitem["video_id"]
                        if video_id is not None:
                            isvideo = 'TRUE'
                    file_name = "nbanews_%s.jpg" % self.curr_num_of_nba
                    file_path = os.path.join(
                        "D:\StefanClub\StefanClub\www\static\img\sinasports",
                        file_name)
                    urllib.request.urlretrieve(imgsrcurl, file_path)
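                    # A hedged alternative (not in the original spider): wrap the
                    # download so one unreachable image does not abort parsing of
                    # the whole response, for example:
                    #
                    #     try:
                    #         urllib.request.urlretrieve(imgsrcurl, file_path)
                    #     except OSError:
                    #         file_name = ''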

                    nbanews["number"] = self.curr_num_of_nba
                    self.curr_num_of_nba = self.curr_num_of_nba + 1

                title = nbanewsitem["title"]
                titleurl = nbanewsitem["url"]
                if titleurl is None:
                    titleurl = nbanewsitem["url_https"]
                newstime = nbanewsitem["mtime"]
                # mtime is a Unix timestamp; convert it to a datetime object.
                newstime = datetime.datetime.fromtimestamp(newstime)
                comment_id = nbanewsitem["new_commentid"]
                channel = comment_id[0:2]
                newsid = comment_id[3:-2]
                comment_url = "http://comment5.news.sina.com.cn/comment/skin/default.html?channel=" + channel + "&newsid=" + newsid
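                # The fixed slices above assume new_commentid has the form
                # "<channel>:<newsid>:<suffix>" with a two-character channel code
                # (an inference from the offsets, not confirmed by the source).
                # Splitting on ':' would tolerate other channel lengths, e.g.:
                #
                #     channel, newsid = comment_id.split(':')[0:2]
                #     comment_url = ("http://comment5.news.sina.com.cn/comment/"
                #                    "skin/default.html?channel=%s&newsid=%s"
                #                    % (channel, newsid))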
                labels = nbanewsitem["labels"]
                # Keep at most five label keys as tags; remaining slots stay ''.
                labellist = [''] * 5
                if isinstance(labels, dict):
                    for i, key in enumerate(list(labels.keys())[:5]):
                        labellist[i] = key

                nbanews["imgsrcurl"] = "../static/img/sinasports/%s" % file_name
                nbanews["imgurl"] = imgurl
                nbanews["isvideo"] = isvideo
                nbanews["title"] = title
                nbanews["titleurl"] = titleurl
                nbanews["newstime"] = newstime
                nbanews["comment_url"] = comment_url
                for jj, label in enumerate(labellist, start=1):
                    nbanews["tag%s" % jj] = label
                    nbanews["tag%surl" % jj] = "//tags.sports.sina.com.cn/" + label
                nbanews["newstype"] = newstype
                yield nbanews
        else:
            # The payload sometimes arrives without a "data" key; retry the same
            # URL. dont_filter=True keeps Scrapy's duplicate filter from dropping
            # this repeated request.
            yield Request(request_url,
                          meta={
                              "newstype": newstype,
                              "request_url": request_url
                          },
                          callback=self.parse_nba_news,
                          dont_filter=True)