Example #1
	def parse(self, response):
		mysql = msyqlHelper()
		old = response.meta
		names = set(['上架感言!'])
		links = response.xpath(old['xpath'])

		j = 1
		for link in links:
			name = link.xpath('text()').extract_first()
			# skip duplicate titles and the promo entry seeded into names
			if name in names:
				continue
			href = link.xpath('@href').extract_first()
			url = urljoin(response.url, href)
			names.add(name)
			meta = {}
			meta['name'] = name
			meta['bid'] = old['bid']
			meta['size'] = 0
			meta['is_vip'] = 1
			meta['prev_cid'] = 0
			meta['next_cid'] = 0
			meta['sequence'] = j
			j += 1
			self.logger.info('Parse url is %s', url)
			chapter_id = mysql.insert(meta)
			meta['chapter_id'] = chapter_id
			if old['other']:
				meta['id'] = old['id'] + href.replace('.html', '')
			else:
				meta['id'] = old['id']
			self.logger.info('chapter_id is %s', chapter_id)
			yield scrapy.Request(url, callback=self.parse2, meta=meta)
		mysql.close()
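None of these examples include their imports or the shared msyqlHelper class, so they are not runnable as-is. A minimal sketch of what they appear to assume — Scrapy, urljoin, and a pymysql-backed helper exposing the insert/insertbook/close calls used above — could look like the following. Only the method names and the column list (taken from the commented-out schema hint in Example #5) come from the examples; the table names, connection settings, and the choice of pymysql are assumptions:

import pymysql  # assumption: any DB-API MySQL driver would work
import scrapy
from urllib.parse import urljoin  # on Python 2: from urlparse import urljoin


class msyqlHelper(object):
    """Hypothetical reconstruction of the helper shared by these spiders."""

    def __init__(self):
        self.conn = pymysql.connect(host='localhost', user='root',
                                    password='', db='novel',
                                    charset='utf8mb4')

    def insertbook(self, name):
        # create a book row and return its autoincrement id
        with self.conn.cursor() as cur:
            cur.execute('INSERT INTO books (name) VALUES (%s)', (name,))
            rowid = cur.lastrowid
        self.conn.commit()
        return rowid

    def insert(self, meta):
        # create a chapter stub (no content yet) and return its id
        with self.conn.cursor() as cur:
            cur.execute(
                'INSERT INTO chapters '
                '(bid, name, sequence, size, is_vip, prev_cid, next_cid) '
                'VALUES (%s, %s, %s, %s, %s, %s, %s)',
                (meta['bid'], meta['name'], meta['sequence'], meta['size'],
                 meta['is_vip'], meta['prev_cid'], meta['next_cid']))
            rowid = cur.lastrowid
        self.conn.commit()
        return rowid

    def close(self):
        self.conn.close()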
Example #2
    def parse(self, response):
        mysql = msyqlHelper()
        names = set(['上架感言!'])
        links = response.xpath(response.meta['linkpath'])

        j = 1
        maxcid = 1
        for link in links:
            name = link.xpath('text()').extract_first()
            if name in names:
                continue
            href = link.xpath('@href').extract_first()
            next_url = urljoin(response.url, href)
            names.add(name)
            meta = dict()
            meta['name'] = name
            meta['bid'] = response.meta['bid']
            meta['size'] = 0
            meta['is_vip'] = 1
            # the first chapter has no predecessor
            if j == 1:
                meta['prev_cid'] = 0
            else:
                meta['prev_cid'] = maxcid - 1
            meta['next_cid'] = maxcid + 1
            maxcid += 1
            meta['sequence'] = j
            j += 1
            self.logger.info('Parse url is %s', next_url)
            chapter_id = mysql.insert(meta)
            meta['contentxpath'] = response.meta['contentxpath']
            meta['id'] = chapter_id
            yield scrapy.Request(next_url, callback=self.parse_content, meta=meta)
        mysql.close()
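The parse_content callback these requests target is not shown in the examples. It might either fill in the stub row created by insert() or write a complete row the way Example #3 below does with inseraAll. A hedged sketch taking the second route, reusing the contentxpath handed down via meta (everything here is an assumption except the helper calls and the field names, which mirror Example #3):

    def parse_content(self, response):
        # hypothetical sketch, not part of the original spider
        mysql = msyqlHelper()
        data = dict()
        data['bid'] = response.meta['bid']
        data['name'] = response.meta['name']
        data['sequence'] = response.meta['sequence']
        data['is_vip'] = response.meta['is_vip']
        data['prev_cid'] = response.meta['prev_cid']
        data['next_cid'] = response.meta['next_cid']
        # extract the chapter body with the XPath chosen by parse()
        paragraphs = response.xpath(response.meta['contentxpath']).extract()
        data['content'] = '\r\n'.join(s for s in paragraphs if s != '')
        data['size'] = len(data['content'])
        mysql.inseraAll(data)  # same helper call as in Example #3
        mysql.close()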
Example #3
    def parse(self, response):
        mysql = msyqlHelper()
        data = dict()
        meta = dict()
        data['bid'] = response.meta['bid']
        data['size'] = 0
        data['is_vip'] = 0
        data['name'] = response.xpath(
            '//div[@id="htmltimu"]/h2/span/text()').extract_first()

        # collect non-empty paragraphs and join them into the chapter body
        paragraphs = response.xpath('//div[@id="chapterContent"]/p/text()').extract()
        paragraphs = [s for s in paragraphs if s != '']
        content = '\r\n'.join(paragraphs)
        data['content'] = content
        data['size'] = len(content)
        data['sequence'] = response.meta['sequence']
        data['prev_cid'] = 0
        data['next_cid'] = 0
        chapter_id = mysql.inseraAll(data)
        self.logger.info(data)
        mysql.close()
        # hard-coded stop marker: do not follow past this chapter title
        if data['name'] == '第一千零九十章少年的奇怪行为':
            return
        # follow the "下一页" (next page) link, if any
        href = response.xpath(
            '//a[contains(.//text(), "下一页")]/@href').extract_first()
        if href is None:
            return
        meta['bid'] = response.meta['bid']
        meta['sequence'] = response.meta['sequence'] + 1
        meta['last_name'] = response.meta['last_name']
        next_url = urljoin(response.url, href)
        yield scrapy.Request(next_url, callback=self.parse, meta=meta)
Example #4
	def parse(self, response):
		mysql = msyqlHelper()
		old = response.meta
		names = set(['上架感言!'])
		links = response.xpath(old['xpath'])

		j = 1
		for link in links:
			name = link.xpath('text()').extract_first()
			if name in names:
				continue
			href = link.xpath('@href').extract_first()
			url = urljoin(response.url, href)
			names.add(name)
			meta = {}
			meta['name'] = name
			meta['bid'] = old['bid']
			meta['size'] = 0
			meta['is_vip'] = 1
			meta['prev_cid'] = 0
			meta['next_cid'] = 0
			meta['sequence'] = j
			meta['contentxpath'] = old['contentxpath']
			j += 1
			self.logger.info('Parse url is %s', url)
			chapter_id = mysql.insert(meta)
			# unlike Example #1, the database id is reused as the chapter id
			meta['id'] = chapter_id
			yield scrapy.Request(url, callback=self.parse2, meta=meta)
		mysql.close()
Example #5
 def parse(self, response):
     mysql = msyqlHelper()
     #txt = json.loads(response.body)
     #self.logger.info(txt)
     #return;
     chapters = response.xpath('//ul[@class="t-list"]/li[position()>86]/a')
     # sequence numbering starts at 300 for this run
     i = 300
     cookies = {
         "Hm_lvt_b7a5349c0dc4d90da89e89cc58ee99da": 1523950381,
         "UserItem_HGREAD_7040":
         "%7b%22UserId%22%3a2699411%2c%22OpenId%22%3a%22951678%22%2c%22AccessToken%22%3a%229E8E5C106BF94A14BE3760E3A5D1483F%22%2c%22RefreshToken%22%3a%2255DB88B286834E26A4DC66776E770537%22%2c%22ExpiresIn%22%3a1200%2c%22NickName%22%3a%22jinmincc%22%2c%22QQNo%22%3a%22%22%2c%22EMail%22%3a%22%22%2c%22Gender%22%3anull%2c%22IntroSelf%22%3a%22%22%2c%22HeadImgUrl%22%3a%22%22%2c%22UDate%22%3a%22%5c%2fDate(-62135596800000)%5c%2f%22%2c%22NewGuid%22%3anull%2c%22SumAmount%22%3a978%2c%22GiveSumAmount%22%3a0%2c%22GiveLoseTime%22%3a%22%5c%2fDate(-62135596800000)%5c%2f%22%2c%22VipScore%22%3a0%2c%22VipLoseScore%22%3a0%2c%22VipGrowth%22%3a0%2c%22VipLoseTime%22%3a%22%5c%2fDate(-62135596800000)%5c%2f%22%2c%22SVipLoseTime%22%3a%22%5c%2fDate(-62135596800000)%5c%2f%22%2c%22IsVip%22%3afalse%2c%22IsSVip%22%3afalse%2c%22VipTag%22%3a%22VIP%22%2c%22ScoreLevel%22%3a0%2c%22GrowthLevel%22%3a0%2c%22VipDiscount%22%3a1.000%2c%22TpOpenId%22%3anull%2c%22TpExpiresTime%22%3a0%7d",
         "USER_INFO_ACCESS_TOKEN":
         "EcXxcJEdDjsPKbanpdVAApnGhUOfe2RFwP9PRlNa2owtfVn9CvkMDQ==",
         "USER_INFO_EXPIRES_IN": "BSQWOyR09yo=",
         "USER_INFO_REFRESH_TOKEN":
         "RzYHBqus6gbGAgWeBK9D2LGd0IDtcKpO5/2zUwFRI2daAc3oHqA1ww==",
         "Hm_lpvt_b7a5349c0dc4d90da89e89cc58ee99da": 1523950925
     }
     #(`bid`, `name`,`sequence`,`size`,`is_vip`,`prev_cid`,`next_cid`,`recent_update_at`,`created_at`,`updated_at`)
     for chapter in chapters:
         # stop once sequence 500 is reached, before a stub row is inserted
         if i >= 500:
             break
         link_info = chapter.xpath('@href').extract_first()
         name = chapter.xpath('text()').extract_first()
         link = link_info.split('/')
         #self.logger.info(link)
         headers = {
             "Referer": response.url,
             "Host": "www.hgread.com",
             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36"
         }
         meta = {}
         meta['bid'] = 45
         meta['name'] = name
         meta['sequence'] = i
         meta['size'] = 0
         meta['is_vip'] = 0
         meta['prev_cid'] = 0
         meta['next_cid'] = 0
         chapter_id = mysql.insert(meta)
         meta['id'] = chapter_id
         i = i + 1
         url_t = 'http://www.hgread.com/home/OBookApiAgent?action=Chapter&bookId=%s&chapterId=%s' % (
             link[2], link[3][0:-5])
         #self.logger.info(meta)
         #url_format = 'http://m.iyunyue.com/inter/ChapterService.aspx?cmd=getchaptercontent&from=&iswx=0&bookid=413088&chapterid=%s' % (chapter['chapterId'],)
         yield scrapy.Request(url_t,
                              callback=self.parse_content,
                              meta=meta,
                              cookies=cookies,
                              headers=headers)
     mysql.close()
Example #6
 def start_requests(self):
     mysql = msyqlHelper()
     for url in self.start_urls:
         # each entry is a tuple: (book name, listing url, chapter-link xpath, id)
         book_name = url[0]
         link = url[1]
         bid = mysql.insertbook(book_name)
         meta = {}
         meta['bid'] = bid
         meta['xpath'] = url[2]
         meta['id'] = url[3]
         yield scrapy.Request(link, callback=self.parse, meta=meta)
     mysql.close()
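start_requests indexes each start_urls entry positionally, so here the attribute is evidently a list of tuples rather than the usual list of URL strings. A hypothetical entry consistent with the indexing above (the concrete values are invented; per Example #1, the fourth element is the external id that parse() may extend with a chapter's href):

    start_urls = [
        ('某小说', 'http://www.example.com/book/123/',
         '//div[@class="chapter-list"]//a', 'book123_'),
    ]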
Example #7
    def start_requests(self):
        mysql = msyqlHelper()
        for url in self.start_urls:
            # here the tuple carries the book id directly instead of inserting the book
            link = url[1]
            bid = url[0]
            meta = {}
            meta['bid'] = bid
            meta['xpath'] = url[2]
            meta['id'] = url[3]
            yield scrapy.Request(link, callback=self.parse, meta=meta)
        mysql.close()
Example #8
    def parse(self, response):
        mysql = msyqlHelper()
        names = set(['上架感言!'])
        links = response.xpath(response.meta['linkpath'])
        j = 1
        maxcid = 1
        for link in links:
            name = link.xpath('text()').extract_first()
            if name in names:
                continue
            href = link.xpath('@href').extract_first()
            next_url = urljoin(response.url, href)
            #names.add(name)  # de-duplication by name is disabled here
            meta = dict()
            meta['name'] = name
            meta['bid'] = response.meta['bid']
            meta['size'] = 0
            meta['is_vip'] = 1
            # prev_cid/next_cid are not computed by this spider; both stay 0
            meta['prev_cid'] = 0
            meta['next_cid'] = 0
            maxcid = maxcid + 1
            meta['sequence'] = j
            j = j + 1
            chapter_id = mysql.insert(meta)
            meta['contentxpath'] = response.meta['contentxpath']
            meta['id'] = chapter_id
            self.logger.info('next url is %s', next_url)
            headers = {
                "Referer": response.url,
                "Host": "www.vodtw.com",
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36"
            }
            yield scrapy.Request(next_url,
                                 callback=self.parse_content,
                                 meta=meta,
                                 headers=headers)
        mysql.close()
Example #9
    def parse(self, response):
        mysql = msyqlHelper()
        old = response.meta
        names = set(['上架感言!'])
        links = response.xpath(old['xpath'])
        self.logger.info(response.status)
        #self.logger.info('111 111 11 11 111 url------------------%s',response.url)
        j = 1
        for link in links:
            name = link.xpath('text()').extract_first()
            if name in names:
                continue
            href = link.xpath('@href').extract_first()
            url = urljoin(response.url, href)
            #name = name.strip()
            # hard-coded stop marker: halt the crawl at this chapter title
            if name == '第四百三十五章 自作孽不可活 4':
                break
            names.add(name)
            meta = {}
            meta['name'] = name
            meta['bid'] = old['bid']
            meta['size'] = 0
            meta['is_vip'] = 1
            meta['prev_cid'] = 0
            meta['next_cid'] = 0
            meta['sequence'] = j
            j = j + 1

            self.logger.info('Parse url is %s', url)
            # the insert is disabled here; the chapter id comes from meta instead
            #chapter_id = mysql.insert(meta);
            #meta['chapter_id'] = chapter_id
            #meta['id'] = old['id']+href.replace('.html','')
            meta['id'] = old['id']
            self.logger.info('bid is %s', meta['bid'])
            self.logger.info('name is %s', name)
            yield scrapy.Request(url, callback=self.parse2, meta=meta)
        mysql.close()
Example #10
    def parse(self, response):
        mysql = msyqlHelper()
        names = set(['上架感言!'])
        links = response.xpath(response.meta['linkpath'])

        self.logger.info(links)
        #cookies = response.headers.getlist('Set-Cookie')
        cookies = {
            "UM_distinctid":
            "162a3e6a64e89-0843b576687bb1-2e06372c-51000-162a3e6a6511a9",
            "uvip": "faaf79da50e39d893598fd8fce28cc04",
            "wgid": "1",
            "tlid": "223",
            "qdi": "1395",
            "qp": "30025",
            "id": "495059",
            "name": "user495059",
            "names": "%22user495059%22",
            "contact": "%22%5Cu91d1%5Cu5999%5Cu5999%22",
            "pic": "495059.jpg",
            "v": "1",
            "code": "9c960aa1d4b92cfa9569f26edc2cf2aa",
            "phone_unbind": "1",
            "tuid": "30025",
            "PHPSESSID": "gaa04b81clg2qj6n3qoeavmp52",
            "pindao": "b",
            "bi": "204",
            "CNZZDATA1267452641": "772678074-1523166968-%7C1524820323",
            "Hm_lvt_589e8b9ebda178159870e84dcda2b999": "1524801203",
            "Hm_lpvt_589e8b9ebda178159870e84dcda2b999": "1524821433"
        }
        #headers = {
        #"Referer":"http://www.sxyj.net/Book_Read/bookId_4dc9650165c6405f9219947466176978/chapterId_465531889b8648c0a26ec775eeda2056.html"
        #}
        j = 1
        #meta = dict()
        #meta['contentxpath'] = response.meta['contentxpath']
        #yield scrapy.Request("http://www.sxyj.net/WebApi/Book/GetChapter?bookId=4dc9650165c6405f9219947466176978&chapterId=465531889b8648c0a26ec775eeda2056",callback=self.parse_content,meta=meta,cookies=cookies,headers=headers)
        #return;
        maxcid = 1
        for link in links:
            name = link.xpath('text()').extract_first()
            if name in names:
                continue

            href = link.xpath('@href').extract_first()
            hrefArr = href.split('/')
            #BookId = hrefArr[2][7:]
            #ChapterIds = hrefArr[3][10:-5]
            next_url = urljoin(response.url, href)
            self.logger.info(name)
            meta = dict()
            meta['name'] = name
            #chapter = mysql.getByBidAndName(name,18)
            #content = chapter.get('content')
            #self.logger.info(chapter);
            #if content != '':
            #   continue

            #self.logger.info("--------name is %s" % name)
            meta['bid'] = response.meta['bid']
            meta['size'] = 0
            meta['is_vip'] = 1
            # prev_cid/next_cid are not computed by this spider; both stay 0
            meta['prev_cid'] = 0
            meta['next_cid'] = 0

            maxcid = maxcid + 1
            meta['sequence'] = j
            j = j + 1
            self.logger.info('name is %s, url is %s', name, next_url)

            headers = {
                "Referer":
                response.url,
                "Host":
                "m.bsread.com",
                "User-Agent":
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
            }
            # j was already incremented, so this skips the first ten links
            if j <= 11:
                continue
            chapter_id = mysql.insert(meta)
            #chapter_id = chapter.get('id')
            meta['contentxpath'] = response.meta['contentxpath']
            meta['id'] = chapter_id
            #temp = "http://www.sxyj.net/WebApi/Book/GetChapter?bookId=%s&chapterId=%s" % (BookId,ChapterIds)
            #yield scrapy.Request(next_url,callback=self.parse_content,meta=meta,cookies=cookies,headers=headers)
            aid = hrefArr[2][0:-5]  # chapter id: third path segment minus ".html"

            formdata = {"act": "gView", "bid": '224', "aid": aid}
            self.logger.info(formdata)
            yield scrapy.FormRequest(url='https://m.bsread.com/wx/s.php',
                                     formdata=formdata,
                                     headers=headers,
                                     callback=self.parse_content2,
                                     cookies=cookies,
                                     meta=meta)
        mysql.close()
Example #11
    def parse(self, response):
        mysql = msyqlHelper()
        names = set()
        links = response.xpath(response.meta['linkpath'])

        self.logger.info(links)
        #cookies = response.headers.getlist('Set-Cookie')
        cookies = {
            "jieqiVisitInfo":
            "jieqiUserLogin%3D1525764839%2CjieqiUserId%3D123189",
            "read_pagenum": "1",
            "jieqiWapPsize": "-11",
            "shuhai_history_":
            "%5B%7B%22aid%22%3A%2211817%22%2C%22cid%22%3A1230652%2C%22aname%22%3A%22%25CE%25D2%25C4%25C3%25CA%25B1%25B9%25E2%25BB%25BB%25C4%25E3%25D2%25BB%25CA%25C0%25B3%25D5%25C3%25D4%22%2C%22autname%22%3A%22%25CA%25A2%25C9%25D9%22%2C%22asort%22%3A%22%25CF%25D6%25B4%25FA%25D1%25D4%25C7%25E9%22%2C%22cname%22%3A%22%2B%25B5%25DA5%25D5%25C2%2B%25D7%25ED%25BE%25C6%25B6%25D4%25BF%25B9%22%2C%22siteid%22%3Anull%2C%22sortid%22%3A%22111%22%7D%2C%7B%22aid%22%3A%2213540%22%2C%22cid%22%3A2053135%2C%22aname%22%3A%22%25CE%25AA%25C4%25E3%25C4%25A8%25C8%25A5%25D2%25BB%25CA%25C0%25B3%25BE%25B0%25A3%22%2C%22autname%22%3A%22%25BE%25FD%25D6%25B9%25B9%25E9%22%2C%22asort%22%3A%22%25C7%25E0%25B4%25BA%25D0%25A3%25D4%25B0%22%2C%22cname%22%3A%22%2B%25B5%25DA050%25D5%25C2%2526nbsp%253B%2526nbsp%253B%25CB%25AF%25BE%25F5%25CA%25C7%25B8%25F6%25CE%25CA%25CC%25E2%22%2C%22siteid%22%3Anull%2C%22sortid%22%3A%22101%22%7D%5D",
            "PHPSESSID": "1ff197c95d71a9d38021cdf0ccff1508"
        }
        #headers = {
        #"Referer":"http://www.sxyj.net/Book_Read/bookId_4dc9650165c6405f9219947466176978/chapterId_465531889b8648c0a26ec775eeda2056.html"
        #}
        j = 317  # sequence numbering resumes at 317
        #meta = dict()
        #meta['contentxpath'] = response.meta['contentxpath']
        #yield scrapy.Request("http://www.sxyj.net/WebApi/Book/GetChapter?bookId=4dc9650165c6405f9219947466176978&chapterId=465531889b8648c0a26ec775eeda2056",callback=self.parse_content,meta=meta,cookies=cookies,headers=headers)
        #return;
        maxcid = 1
        for link in links:
            name = link.xpath('text()').extract_first()

            href = link.xpath('@href').extract_first()
            hrefArr = href.split('/')
            #BookId = hrefArr[2][7:]
            #ChapterIds = hrefArr[3][10:-5]
            next_url = urljoin(response.url, href)
            self.logger.info(name)
            meta = dict()
            meta['name'] = name

            meta['bid'] = response.meta['bid']
            meta['size'] = 0
            meta['is_vip'] = 1
            # prev_cid/next_cid are not computed by this spider; both stay 0
            meta['prev_cid'] = 0
            meta['next_cid'] = 0

            maxcid = maxcid + 1
            meta['sequence'] = j
            j = j + 1
            self.logger.info('name is %s, url is %s', name, next_url)

            headers = {
                "Referer":
                response.url,
                "Host":
                "yomeng.yunshuge.com",
                "User-Agent":
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
            }
            self.logger.info('Parse url is  %s', next_url)

            chapter_id = mysql.insert(meta)
            #chapter_id = chapter.get('id')
            meta['contentxpath'] = response.meta['contentxpath']
            meta['id'] = chapter_id
            #temp = "http://www.sxyj.net/WebApi/Book/GetChapter?bookId=%s&chapterId=%s" % (BookId,ChapterIds)
            yield scrapy.Request(next_url,
                                 callback=self.parse_content,
                                 meta=meta,
                                 cookies=cookies,
                                 headers=headers)
            #aid = href.split('/')[2][0:-5]
            # stop after the first chapter request; the FormRequest variant
            # below is left disabled
            break
            #formdata={"act":"gView","bid":'224',"aid":aid}
            #self.logger.info(formdata)
            #yield scrapy.FormRequest(url='https://m.bsread.com/wx/s.php',formdata=formdata,headers=headers,callback=self.parse_content2,cookies=cookies,meta=meta)
        mysql.close()