Example #1
 def parse_video(self, response):
     sel = Selector(response)
     meta = response.request.meta
     self.log('The meta: %r' % meta)
     cur_path = meta['cur_path']
     cur_topic_idx = meta['cur_topic_idx']
     cur_level_idx = meta['cur_level_idx']
     cur_video_idx = meta['cur_video_idx']
     cur_video = self.topics[cur_topic_idx]['levels'][cur_level_idx]['videos'][cur_video_idx]
     self.log('Current video: %r' % cur_video)
     
     descr_list = sel.css('.g9.mb10 p').xpath('text()').extract()
     if descr_list:
         descr = descr_list[0].strip()
         cur_video['descr'] = descr
         self.log('Got descr > [%s]' % descr)
     else:
         self.log('Descr failed >>>>>>>> [%s]' % response.url)

     youtube_keys = sel.css('.g9.lesson-video.ic iframe').xpath('@src').re(r'http://.*youtube\.com/embed/([^?]*)\?')
     if youtube_keys:
         youtube_key = youtube_keys[0]
         cur_video['youtube_key'] = youtube_key
         os.system('cd /tmp/videos && proxychains youtube-dl http://www.youtube.com/watch\?v\=%s --write-sub --all-subs --write-auto-sub' % youtube_key)
         os.system('mv /tmp/videos/*%s* %s' % (youtube_key, cur_path))
         self.log('Got youtube key > %s' % youtube_key)
     else:
         self.log('Youtube key failed >>>>>>>> [%s]' % response.url)
Example #2
 def parsePost(self, response):
     logging.info(response)
     sel = Selector(response)
     posts = sel.css("Table.PostBox")
     breadcrumbs = sel.css("#Breadcrumbs")
     # condition = breadcrumbs.xpath("./a[3]/text()")
     condition = breadcrumbs.xpath("./a[3]/text()").extract()[0].lower()
     items = []
     topic = response.xpath('//div[contains(@id,"PageTitle")]/h1/text()').extract()[0]
     url = response.url
     for post in posts:
         item = PostItemsList()
         item["author"] = post.css(".msgUser").xpath("./a[2]").xpath("text()").extract()[0]
         item["author_link"] = post.css(".msgUser").xpath("./a[2]/@href").extract()[0]
         item["condition"] = condition.lower()
         item["create_date"] = self.getDate(
             re.sub(
                 " +|\n|\r|\t|\0|\x0b|\xa0",
                 " ",
                 response.css("td.msgThreadInfo").xpath("text()").extract()[0].replace("Posted ", ""),
             )
             .strip()
             .lower()
         )
         item["domain"] = "".join(self.allowed_domains)
         post_msg = self.cleanText(post.css(".PostMessageBody").extract()[0])
         item["post"] = post_msg
         # item['tag'] = ''
         item["topic"] = topic
         item["url"] = url
         items.append(item)
     return items
Example #3
        def leer(self, response):           
            sel = Selector(response)           
            con_titulo = sel.css('div.product-info')
            con_ficha = sel.css('ul#product-details')
            con_disp = sel.css('span#disponibilidad_entrega')
            con_precio = sel.css('div#product-buy-small')

            titulo = con_titulo.xpath('.//span/text()')[0].extract()
            autor = con_titulo.xpath('.//a/text()')[0].extract()
            editorial = con_ficha.xpath('.//li')[0].xpath('.//a/text()')[0].extract()
            isbn = con_ficha.xpath('.//li')[3].xpath('.//span/text()')[1].extract()
            paginas = con_ficha.xpath('.//li')[4].xpath('.//span/text()')[1].extract()
            disponibilidad = con_disp.xpath('.//span/text()')[1].extract()
            precio = con_precio.xpath('.//p/text()')[0].extract()

            item = BuscadorItem()
            item['ISBN'] = isbn
            item['titulo'] = titulo
            item['autor'] = autor
            item['num_pag'] = paginas
            item['editorial'] = editorial
            item['precio'] = precio
            item['disponibilidad'] = disponibilidad

            #self.escribe_temp(isbn)

            return item            
Example #4
    def parse_item(self, response):
        rand = random.randint(1, 2)
        time.sleep(rand)
        items = []
        sel = Selector(response)
        base_url = get_base_url(response)
        title_root_path = sel.css('.mainTitle')
        phone_root_path = sel.css('.l_phone')
        if title_root_path is None or len(title_root_path) == 0 or phone_root_path is None or len(phone_root_path) == 0:
            return items

        title_path = title_root_path.xpath('./h1/text()')
        phone_path = phone_root_path.xpath('text()')
        if title_path is not None and phone_path is not None:
            info_name = title_path
            item = AladdinItem()
            info_name_extract = info_name.extract()
            if info_name_extract is not None and len(info_name_extract) > 0:
                name = info_name_extract[0]
                item['name'] = name

            phone_extract = phone_path.extract()
            if phone_extract is not None and len(phone_extract) > 0:
                phone = phone_extract[0]
                phone_set = set()
                phone_set.add(phone)
                item['phone'] = phone_set

            if item.get('name') is not None and item.get('phone') is not None and len(set(item.get('phone'))) > 0:
                items.append(item)
        info(str(response))
        return items
Example #5
    def extract_info_from_post(self, post):
        """
        Determines if the specified craigslist link has contact info. This is
        done based of the fact that there is a hyperlink created in the html
        page with the class showcontact when a page is hiding user contact info
        We are not interested in the users number, we only want to make sure
        that the listing is associated with a phone number
        """

        baseurl = MutableURL(post)
        # Get link content & build response object from url content
        body = requests.get(post)
        response = HtmlResponse(url=post, body=body.content)
        body.connection.close()
        # Build selector from response
        selector = Selector(response=response)
        # Extract the price from the link
        price = selector.css('span[class="price"]').xpath('text()').re(
            '[$]\d+')
        # Create the response
        post = {'link': post}
        # Attach the link that contains the full content of the page
        post['contact_info_link'] = selector.css(
            'a[class="showcontact"]::attr(href)').extract_first()
        # Expand the link
        post['contact_info_link'] = self.base_url.joinurl(
            post['contact_info_link']) if post['contact_info_link'] else None
        post['price'] = int(price[0][1:]) if price else None
        return post
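A brief usage sketch of the helper above; the listing URL, the `spider` instance name and the filtering logic are illustrative assumptions, not part of the original code:

    # Hypothetical caller: keep only listings that expose both a price and a contact link.
    listing_url = "https://city.craigslist.org/apa/d/example-listing/0000000.html"  # placeholder URL
    post = spider.extract_info_from_post(listing_url)  # `spider` is an instance of the class above
    if post['contact_info_link'] and post['price'] is not None:
        print('%s -> $%d (has contact info)' % (post['link'], post['price']))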
Example #6
    def parse_celebrity(self, response):
        """
        Crawl the celebrity page.
        """
        celebrity = CelebrityItem()
        sel = Selector(response)

        celebrity["id"] = self._parse_id(response.url)
        name = sel.css("div.per_header h2::text").extract()
        celebrity["name"] = name[0] if name else ""
        name_en = sel.css("div.per_header p.enname::text").extract()
        celebrity["name_en"] = name_en[0] if name_en else ""

        yield Request(
            url=urljoin(response.url, "details.html"),
            callback=self.parse_celebrity_detail,
            meta={"celebrity": celebrity.copy()}
        )
        yield Request(
            url=urljoin(response.url, "awards.html"),
            callback=self.parse_celebrity_awards,
            meta={"celebrity": celebrity.copy()}
        )

        yield celebrity
Example #7
    def search_parse(self, response):
        sel = Selector(response)

        print "myparam is %d" % (response.request.doubanid)
        title = sel.css("title")[0].xpath("./text()")[0].extract().strip()
        print "title is " + title

        photo = sel.css("a.nbgnbg")[0]
        imgurl = photo.xpath("./img/@src")[0].extract()
        arr1 = imgurl.split("/")
        print "img is " + arr1[len(arr1) - 1]

        self.moviedb.updMovie_doubanmovie(response.request.doubanid, title, arr1[len(arr1) - 1])

        arrinfo = sel.css("div#info")
        for curinfo in arrinfo:
            print "info is " + curinfo.extract()
            bi = curinfo.extract().find(u">又名:</span>")
            if bi > 0:
                tstr = curinfo.extract()[bi + len(u">又名:</span>") :]
                ei = tstr.find("<br>")
                tsrt1 = tstr[0:ei].strip()
                print "other name is " + tsrt1
                tarr1 = tsrt1.split("/")
                for t1 in tarr1:
                    t1 = t1.strip()
                    print "t1 is " + t1
                    self.moviedb.addMovieName_doubanmovie(response.request.doubanid, t1)

            break

        return []
Example #8
    def parse(self, response):
        sel = Selector(response)
        list_sel = sel.css('div[id*=wrapper] div[id*=left] div[id*=content_box] div.content div.title a::attr(href)').extract()
        for link in list_sel:
            print (link)
            if link[0] == 'h':
                inpage = link
            else:
                inpage = 'http://www.ruyig.com/' + link

            yield Request(inpage, callback=self.main_parse)

        print ('------------- next page ---------------')
        next_url = sel.css('div[id*=wrapper] div[id*=left] div[id*=content_box] div[id*=page_num] a::attr(href)').extract()[-1]

        if next_url[0] == 'h':
            next_link = next_url
        else:
            next_link = "http://www.ruyig.com" + next_url

        print(next_link)
        if next_link:
            self.page_count += 1
            print("*" * 30)
            print(self.page_count)
            print("*" * 30)
            yield Request(next_link, callback=self.parse)
Example #9
def parse(doc):
    s = Selector(text=doc)

    title = s.css(".fullText h4::text").extract()
    title = title[0] if len(title) == 1 else ''
    tmp = s.css(".fullText .annexInfo span::text").extract()
    rd = re.compile('\d+')
    if len(tmp) == 2:
        time, symble = tmp
    elif len(tmp) == 1:
        if rd.match(tmp[0]):
            time = tmp[0]
            symble = '--'
        else:
            time = '--'
            symble = tmp[0]
    else:
        time, symble = '--', '--'
    court = s.css(".fullText .annexInfo a::text").extract()
    court = court[0] if len(court) == 1 else ''
    content = ''.join(s.css(".fullText .fullCon::text").extract())
    return '<%s,%s,%s,%s>\n\n%s' % (
        title.encode('utf8'),
        time.encode('utf8'),
        symble.encode('utf8'),
        court.encode('utf8'),
        content.encode('utf8'))
Example #10
def situ(imgurl):
    url = 'http://image.baidu.com/n/pc_search'
    params = {
        'rn':'10',
        'appid':'4',
        'tag':'1',
        'isMobile':'0',
        'queryImageUrl':imgurl,
        'querySign':'',
        'fromProduct':'',
        'productBackUrl':'',
        'fm':'chrome',
        'uptype':'plug_in'
    }
    Headers['User-Agent'] = UA.chrome
    z = requests.get(url,params=params,headers=Headers)
    response = Selector(text=z.text)
    # keyword description of the image
    kw = response.css('.guess-info-word-highlight::text').extract_first()
    # Baidu Baike entry name
    bk = response.css('.guess-newbaike-name::text').extract_first()
    # titles of the image source pages
    img_title = response.css('.source-card-topic-title-link::text').extract()
    # descriptions of the image source pages
    img_content = response.css('.source-card-topic-content::text').extract()
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=''.join(img_title), lower=True, source = 'all_filters')
    for item in tr4s.get_key_sentences(num=3):
        print(item.index, item.weight, item.sentence)
Example #11
    def _parse_home_page(self, response, item=None):
        sel = Selector(response)

        if self._check_blog_created(sel):
            return

        if not item:
            item = BlogerItem()

        if self._check_private(sel):
            item['url'] = sel.css('div.avt > a::attr(href)').extract()[0]
            item['uid'] = re.search(urlpatterns['home'], item['url']).group(1)
            item['forbidden'] = True
            item['liveness'] = \
                sel.css(u'ul.bbda li:contains(活跃度)::text').re('\d+')[0]
            item['reputation'] = \
                sel.css(u'ul.bbda li:contains(威望)::text').re('\d+')[0]
        else:
            item['url'] = sel.css('#nv > ul > li:first-child a::attr(href)').extract()[0]
            item['uid'] = re.search(urlpatterns['home'], item['url']).group(1)

            ul = sel.css('#statistic_content ul li')
            if len(ul) > 0:
                item['reputation'] = ul.css(u'li:contains(威望) a::text').extract()
                if len(item['reputation']):
                    item['reputation'] = item['reputation'][0]

                item['liveness'] = ul.css(u'li:contains(活跃度) a::text').extract()
                if len(item['liveness']):
                    item['liveness'] = item['liveness'][0]
        return item
Example #12
    def parse_item(self, response):
		items=[]
		sel = Selector(response)
		base_url=get_base_url(response)
		sites_even =sel.css('table.tablelist tr.even')
		for site in sites_even:
			item = JobItem()
			item['name']=site.css('.l.square a').xpath('text()').extract()
			relative_url=site.css('.l.square a').xpath('@href').extract()[0]
			item['detailLink']=urljoin_rfc(base_url,relative_url)
			item['catalog']=site.css('tr > td:nth-child(2)::text').extract()
			item['workLocation']=site.css('tr > td:nth-child(3)::text').extract()
			item['recruitNumber']=site.css('tr > td:nth-child(4)::text').extract()
			item['publishTime']=site.css('tr > td:nth-child(5)::text').extract()
			items.append(item)

		sites_odd=sel.css('table.tablelist tr.odd')
		for site in sites_odd:
			item = JobItem()
			item['name']=site.css('.l.square a').xpath('text()').extract()
			relative_url=site.css('.l.square a').xpath('@href').extract()[0]
			item['detailLink']=urljoin_rfc(base_url,relative_url)
			item['catalog']=site.css('tr > td:nth-child(2)::text').extract()
			item['workLocation']=site.css('tr > td:nth-child(3)::text').extract()
			item['recruitNumber']=site.css('tr > td:nth-child(4)::text').extract()
			item['publishTime']=site.css('tr > td:nth-child(5)::text').extract()
			items.append(item)

		info('parsed ' + str(response))
		return items
Example #13
   def parse_item(self, response):  # Extract data into Items, mainly using XPath and CSS selectors to pull data from the page
       items = []  
       sel = Selector(response)  
       base_url = get_base_url(response)  
       sites_even = sel.css('table.tablelist tr.even')  
       for site in sites_even:  
           item = TencentItem()  
           item['name'] = site.css('.l.square a').xpath('text()').extract()  
           relative_url = site.css('.l.square a').xpath('@href').extract()[0]  
           item['detailLink'] = urljoin_rfc(base_url, relative_url)  
           item['catalog'] = site.css('tr > td:nth-child(2)::text').extract()  
           item['workLocation'] = site.css('tr > td:nth-child(4)::text').extract()  
           item['recruitNumber'] = site.css('tr > td:nth-child(3)::text').extract()  
           item['publishTime'] = site.css('tr > td:nth-child(5)::text').extract()  
           items.append(item)  
           #print repr(item).decode("unicode-escape") + '\n'  
 
       sites_odd = sel.css('table.tablelist tr.odd')  
       for site in sites_odd:  
           item = TencentItem()  
           item['name'] = site.css('.l.square a').xpath('text()').extract()  
           relative_url = site.css('.l.square a').xpath('@href').extract()[0]  
           item['detailLink'] = urljoin_rfc(base_url, relative_url)  
           item['catalog'] = site.css('tr > td:nth-child(2)::text').extract()  
           item['workLocation'] = site.css('tr > td:nth-child(4)::text').extract()  
           item['recruitNumber'] = site.css('tr > td:nth-child(3)::text').extract()  
           item['publishTime'] = site.css('tr > td:nth-child(5)::text').extract()  
           items.append(item)  
           #print repr(item).decode("unicode-escape") + '\n'  
 
       info('parsed ' + str(response))  
       return items  
Example #14
    def parse_celebrity_detail(self, response):
        """
        Crawl celebrity details.
        """
        celebrity = response.meta["celebrity"]
        sel = Selector(response)

        for dt in sel.css("div.per_info_l dt"):
            title = dt.css("::text").extract()[0]
            if title == "出生日期:":
                text = dt.css("::text").extract()[1].rstrip(")")
                if "(" in text:
                    birthday, birthplace = text.split("(", 1)
                else:
                    birthday, birthplace = text, ""
                celebrity["birthday"] = birthday
                celebrity["birthplace"] = birthplace
            elif title == "血型:":
                celebrity["blood"] = dt.css("::text").extract()[1]
            elif title == "星座:":
                celebrity["constellation"] = dt.css("::text").extract()[1]
            elif title == "身高:":
                celebrity["height"] = int(dt.css("::text").extract()[1].rstrip("cm"))
            elif title == "体重:":
                celebrity["height"] = int(dt.css("::text").extract()[1].rstrip("kg"))

        celebrity["intro"] = "\n".join(sel.css("div#lblAllGraphy p::text").extract())
        return celebrity
Example #15
 def getInfo(self, response):
     sel = Selector(response)
     item = response.request.meta['item']
     item['title'] = sel.css('#content-middle h1::text').extract()[0]
     item['description'] = sel.css("#content-middle .node .content p ::text").extract()
     item['url'] = response.url
     return item
Example #16
    def parse(self,response):
        sel = Selector(response)
        base_url = get_base_url(response)
        urls = sel.css('a')
        for url in urls:
            print url.xpath('@href').extract()[0]

        count_item = sel.css("#LIST_PAGINATION_COUNT")
        if len(count_item) > 0:
            total_count = count_item.xpath("text()").extract()[0]
            list_url_tuple = os.path.split(base_url)
            for i in range(int(total_count)):
                url = list_url_tuple[0] + '/i/' + str(i) + "/" + list_url_tuple[1]
                print url

        if len(sel.css(".display_news_con")) > 0:
          
            info = {}
            contents = sel.css(".display_news_con")
            title = contents.css(".atitle").xpath("text()").extract()[0]
            posttime = contents.css(".posttime").xpath("text()").extract()[0]
            items = posttime.split("\r\n")

            temp_submit_time = items[0].split(":")
            info['submit_time'] = temp_submit_time[1] + temp_submit_time[2]
            temp_publish_time = items[1].split(":")
            info['publish_time'] = temp_publish_time[1] + temp_publish_time[2]
            info['department'] = items[2].split(":")[1]
            info['content'] = contents.css(".entry").extract()[0]
            info['last_modified'] = response.headers['Last-Modified']

            return info
Example #17
    def parse2(self, response):
        tv = response.meta['tv']

        sel = Selector(response)
        tv.origin_url = response.url

        p_dirsort = sel.css('div#main-rt div.mod-datum p.dirsort')
        for p in p_dirsort:
            p_type = p.css('::text').extract()[0]

            if u'导演' in p_type:
                tv.director = ''.join(p.css('span::text').extract())
            elif u'主演' in p_type:
                tv.starring = ''.join(p.css('span::text').extract())
            elif u'类型' in p_type:
                tv.category = ''.join(p.css('span::text').extract())

        tv.detail = sel.css('div#main-rt div.mod-datum p.dirtext span:nth-child(2)::text').extract()[0]

        print tv.name, '------->', tv.origin_url

        # mark this entry as animation
        tv.type = 2

        db_session.add(tv)
        db_session.commit()

    #     sub_tv_list = sel.css('div#playCont div div div div.torrent-panel ul li')
    #
    #     for st in sub_tv_list:
    #         try:
    #             st.css('a span').extract()[0]
    #         except IndexError:
    #             sub_tv_index = st.css('::attr(data-idx)').extract()[0]
    #         else:
    #             continue
    #
    #         sub_tv = SubFunViedo(fv_id=tv.id, index=sub_tv_index)
    #         sub_tv.id = st.css('::attr(data-vid)').extract()[0]
    #         sub_tv.origin_url = 'http://www.fun.tv{}'.format(st.css('a::attr(href)').extract()[0])
    #
    #         print sub_tv.index, '-------->', sub_tv.origin_url
    #
    #         request1 = Request(sub_tv.origin_url, callback=self.parse3)
    #         request1.meta['sub_tv'] = sub_tv
    #         yield request1
    #
    # def parse3(self, response):
    #
    #     print 'parse 3 ------->'
    #
    #     sub_tv = response.meta['sub_tv']
    #
    #     sel = Selector(response)
    #     play_count = sel.css('div.playInfo.crumbs div.rightBtn.fix a::text').extract()[0]
    #
    #     sub_tv.play_count = ''.join(play_count[3:].split(','))
    #
    #     db_session.add(sub_tv)
    #     db_session.commit()
Example #18
    def parse(self, response):
        print "%d" % (response.request.cili006searchid)
        sel = Selector(response)

        item = Cili006Item()

        arrtitle = sel.css('div.desc-title')
        if len(arrtitle) <= 0:
            return []

        title = sel.css('div.desc-title')[0]
        print title.extract()
        item['filename'] = title.xpath('./h2/text()')[0].extract()
        print item['filename']
        #emindex = item['filename'].find('<em>')
        #print emindex
        #item['filename'] = item['filename'][0:emindex].strip()
        #print item['filename']

        item['magnet'] = ''
        item['ed2k'] = ''
        item['topic_id'] = response.request.cili006searchid

        arr = sel.css('div.desc-list-item')
        for cur in arr:
            ah = cur.xpath('./div[@class="t"]/a/@href')[0].extract()
            if ah.find('magnet') == 0:
                item['magnet'] = ah
            elif ah.find('ed2k') == 0:
                item['ed2k'] = ah

        self.moviedb.addMovie_cili006(item)

        return []
Example #19
 def parse(self, response):
     items = []
     sel = Selector(response)
     sites_even = sel.css('a.j_th_tit')
     furl = sel.xpath("/html/head/meta[2][@furl]")
     for site in sites_even:
         item = JpanList_Items()
         item['title'] = site.xpath('text()').extract()
         item["link"] = site.xpath('@href').extract()
         item["furl"] = furl.x('@furl').extract()
         item['bid'] = item["link"][0].replace("/p/",'')
         #print item;
         items.append(item)
         print item['bid']
         jRedisco = JRedisco(bid=item['bid'])
         if jRedisco.is_valid():
             jRedisco.save()
             print 'jRedisco.save()'
         else:
             print 'jRedisco.is_valid'
     nextUrl = "http://tieba.baidu.com" + sel.css("a.next::attr(href)").extract()[0]
     print 'Next Page :' + nextUrl
     #self.insert(items)
     request = scrapy.Request(nextUrl,
                          callback=self.parse)
     #return items
     return request
Example #20
 def parse(self, response):
     '''
     cmd = 'phantomjs constructDom.js "%s"' % response.url
     stdout,stderr = subprocess.Popen(cmd,shell= True,stdout = subprocess.PIPE,stderr = subprocess.PIPE).communicate()
     f = file('code.txt', 'w+')
     f.writelines(stdout)
     #print (stdout)
     sel = Selector(text=stdout)
     '''
     sel = Selector(response)
     csrfToken = sel.css("input#j-csrf::attr(value)").extract()[0].strip()
     name = "".join(sel.css('h1.m-source-title::text').extract()).strip()
     bookId = response.url.split("/")[-1]
     item = CartoonItem()
     item['name'] = "".join(sel.css('h1.m-source-title::text').extract()).strip()
     item['url'] = response.url
     item['hitNum'] = "".join(sel.css('div.g-cols--float>div.g-col:nth-of-type(1)>div.metadata:nth-of-type(2)::text').re(u'人气\:(.*)')).strip()
     searchObj = re.search(u'(.*)万', item['hitNum'])
     if searchObj:
         item['hitNum'] = int(float(searchObj.group(1)) * 10000)
     else:
         item['hitNum'] = int(item['hitNum'])
     item['collectionNum'] = -1
     item['likeNum'] = -1
     item['caiNum'] = -1
     item['webName'] = "网易漫画"
     item['crawlTime'] = time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))
     commentApiUrl = "http://manhua.163.com/comment/"+bookId+"/comments?csrfToken="+csrfToken+"&bookId="+bookId+"&page=1"
     request = scrapy.Request(commentApiUrl, callback = self.moreparse)
     request.meta['item'] = item
     return request
Example #21
    def parse2(self, response):
        movie = response.meta['movie']

        sel = Selector(response)
        origin_url = response.url
        director = ''.join(sel.css('div#main-rt div.mod-datum p:nth-child(2) span::text').extract())
        starring = ''.join(sel.css('div#main-rt div.mod-datum p:nth-child(3) span::text').extract())
        detail = sel.css('div#main-rt div.mod-datum p.dirtext span:nth-child(2)::text').extract()[0]
        category = ''.join(sel.css('div#main-rt div.mod-datum p:nth-child(4) span::text').extract())
        play_count = sel.css('div.playInfo.crumbs div.rightBtn.fix a::text').extract()[0]

        print movie.name, '------->', origin_url, '------->', play_count

        if play_count:
            play_count = ''.join(play_count[3:].split(','))

            movie.origin_url = origin_url
            movie.director = director
            movie.starring = starring
            movie.detail = detail
            movie.category = category
            movie.play_count = play_count

            # if movie.play_count.strip() not in [0, '0']:
            #     for f in ['fluent', 'normal', 'high', 'super']:
            #         mp4_url = get_funtv_mp4(origin_url, f)
            #         if mp4_url:
            #             column_name = 'mp4_url_{}'.format(f)
            #             setattr(movie, column_name, mp4_url)
            #             movie.enable = 1

            db_session.add(movie)
            db_session.commit()
Example #22
	def parse_view(self, response):
		info('parsed_view ' + str(response))
		items = []
		sel_view = Selector(response)
		sel_goto = Selector(response)
		sel_fenlei = Selector(response)
		sites_view = sel_view.css('li div a[href*=view]')
		sites_goto = sel_goto.css('a[href*=gotoList]')
		sites_fenlei = sel_fenlei.css('span a[href*=fenlei]')
		#for site in sites_view:	
		#	item = baikeSiteItem()
		#	item['url'] = site.css('::attr(href)')[0].extract()
		#	items.append(item)	
		#for site in sites_goto:	
		#	item = baikeSiteItem()
		#	item['url'] = site.css('::attr(href)')[0].extract()
		#	items.append(item)	
		#for site in sites_fenlei:
		#	item = baikeSiteItem()
		#	item['url'] = site.css('::attr(href)')[0].extract()
		#	items.append(item)
		for site in sites_view:	
			url = self.domains[0]+site.css('::attr(href)')[0].extract()
			yield Request(url, callback=self.parse_word)
				
		for site in sites_goto:	
			url = self.domains[0]+site.css('::attr(href)')[0].extract()
			yield Request(url, callback=self.parse_word)

		for site in sites_fenlei:
			url = self.domains[0]+site.css('::attr(href)')[0].extract()
			yield Request(url, callback=self.parse_word)
Example #23
 def parsePost(self,response):
     logging.info(response)
     sel = Selector(response)
     posts = sel.css(".messageList").xpath('./li')
     items = []
     if len(posts)==0:
         return items
     topic = sel.css('.titleBar').xpath('./h1/text()').extract()[0].strip()
     url = response.url
     for post in posts:
         item = PostItemsList()
         if len(post.css('.userText'))>0:
             item['author'] = post.css('.userText').xpath('./a/text()').extract()[0]
             item['author_link']=response.urljoin(post.css('.userText').xpath('./a/@href').extract()[0])
         else:
             continue
         item['create_date']= self.parseText(str=post.css('.DateTime').extract()[0])
         post_msg= self.parseText(str=post.css('.messageText').extract()[0])
         item['post']=post_msg
         item['tag']='rheumatoid arthritis'
         item['topic'] = topic
         item['url']=url
         logging.info(post_msg)
         items.append(item)
     return items
Example #24
    def parse(self, response):
        #print  response.url
        sel = Selector(response)
        title = ''
        content = ''
        date = ''
        
        tit_finders = ['.header h1', '.lede-headline', '.title', '#article-headline']
        for finder in tit_finders:
            try:
                title = sel.css('%s::text' %finder)[0].extract()
                break
            except Exception as e:
                pass
                
        con_finders = ['.body', '.article-body__content', '#article_body', '#article-body']
        for finder in con_finders:
            try:
                content = sel.css('%s' %finder)[0].extract()
                break
            except Exception as e:
                pass
                
        dat_finders = ['cite abbr', 'time', '.timestamp span']
        for finder in dat_finders:
            try:
                date = sel.css('%s::text' %finder)[0].extract()
                break
            except Exception as e:
                pass

        yield News(url=response.url, title=title, content=content, date=date)        
Example #25
	def parse2(self,response):		
		url = response.meta['url']
		driver = webdriver.PhantomJS(executable_path = '/Users/hantianyan/phantomjs-1.9.8-macosx/bin/phantomjs', service_args = self.service_args)
		driver.get(response.url)
		sel = Selector(text = driver.page_source)
		item = MoreinfoItem()
		release_list = sel.css('.extra-info').xpath('./text()').extract()
		if len(release_list) > 0:
			item['release_time'] = release_list[0].encode('utf-8').split(':')[2]
			item['popularity'] = sel.css('#star_greet').xpath('./@style').re('\d+')[0].encode('utf-8')
			item['comfort'] = sel.css('#star_comfort').xpath('./@style').re('\d+')[0].encode('utf-8')
			nearsel = sel.css('.nearbox')
			item['transport'] = 0
			transport_list = nearsel.xpath(".//div[@data-attr='traffic']").css('.p_star_s').xpath('./@style').re('\d+')
			if len(transport_list) > 0:
				item['transport'] = transport_list[0].encode('utf-8')
			item['hospital'] = 0
			hospital_list = nearsel.xpath(".//div[@data-attr='hospital']").css('.p_star_s').xpath('./@style').re('\d+')
			if len(hospital_list) > 0:
				item['hospital'] = hospital_list[0].encode('utf-8')
			item['education'] = 0
			education_list = nearsel.xpath(".//div[@data-attr='school']").css('.p_star_s').xpath('./@style').re('\d+')
			if len(education_list) > 0:
				item['education'] = education_list[0].encode('utf-8')
			item['business'] = 0
			business_list = nearsel.xpath(".//div[@data-attr='commerce']").css('.p_star_s').xpath('./@style').re('\d+')
			if len(business_list) > 0:
				item['business'] = business_list[0].encode('utf-8')
			sql = "update second_house_table set release_time = '%s',popularity = '%s',comfort = '%s',transport = '%s',hospital = '%s',education = '%s',business = '%s' WHERE url = '%s' " % \
			(item['release_time'],item['popularity'],item['comfort'],item['transport'],item['hospital'],item['education'],item['business'],url)
			self.cursor.execute(sql)
			self.db.commit()
Example #26
    def parse_content(self, response):
        sel = Selector(response)
        item = JobpostcrawlingItem()
        item["company_name"] = sel.xpath('//div[@class="ad-content-header"]/h1/text()').extract()[0]

        if match(r".*organisation-profile", response.url):
            # procedures for company pages
            target = sel.css(".hreview-aggregate > p")
            target.extend(sel.css(".hreview-aggregate > ul > li"))
            item["company_description"] = "\n".join("".join(p.xpath(".//text()").extract()) for p in target)
            item["url"] = response.url
            yield item
        else:
            # procedures for job post pages
            try:
                save_content = sel.xpath('//div[@id="save-content"]/a/text()').extract()[0]
                organisation_link = sel.xpath('//p[@class="organisation-link"]/a/text()').extract()[0]
                item["job_name"] = sel.xpath('//div[@class="main-content-core"]/h2/text()').extract()[0]
                target = sel.css(".hreview-aggregate > p")
                target.extend(sel.css(".hreview-aggregate > ul > li"))
                item["job_description"] = "\n".join("".join(p.xpath(".//text()").extract()) for p in target)
                item["url"] = response.url
                yield item
            except:
                pass
Example #27
    def parse(self, response):      
        sel = Selector(response)
        links_to_annonces = sel.css('div[class="list-lbc"]').xpath('a/@href').extract()
        links_to_annonces = [a.encode('ascii').rstrip() for a in links_to_annonces]

        print response.url

        for link in links_to_annonces:
            # self.parseAnnonce(link)
            # print link
            item = AnnonceItem()
            yield Request(urlparse.urljoin(response.url, link), 
                          meta={'item':item},
                          callback=self.parse_annonce)
            # if 1: break

        # next page
        link_url = None
        links = sel.css('li[class="page"]')

        for link in links:
            link_text = link.xpath('a/text()').extract()
            print link_text
            if len(link_text) and 'suivante' in link_text[0]:
                link_urls = link.xpath('a/@href').extract()
                if len(link_urls):
                    link_url = link_urls[0]
        if link_url:
            yield Request(urlparse.urljoin(response.url, link_url), 
                          meta={},
                          callback=self.parse)
Example #28
    def parse(self, response):
        if response.status == 404:
            # If the response is a 404, just return; nothing to process
            logging.info("response.status:" + str(response.status))
            return
        
        select = Selector(response)
        if "data" in response.meta:
            isNextPage = response.meta["data"]
        else:
            isNextPage = "firstPage"
        
        question_id = self.digitalPattern.findall(response.url)[0]

        
        # only extract the title on the first page
        if isNextPage == "firstPage":
            item = TobosuItem()
            item["question_id"] = question_id
            item["question_title"] = select.css(".aqq-title").xpath(".//h1/text()").extract()[0]
            try:
                item["question_description"] = select.css(".des").extract()[0][15:-4].strip()
            except Exception, e:
                item["question_description"] = ""
                print e
        
            try:
                big_category = ",".join(select.css(".recom-lab").xpath(".//a/text()")[1:].extract())
            except Exception, e:
                big_category = ""
                print e
Example #29
 def parse_item(self, response):  # Extract data into Items, mainly using XPath and CSS selectors to pull data from the page
     items = []
     sel = Selector(response)
     base_url = get_base_url(response)
     sites_even = sel.css("table.tablelist tr.even")
     for site in sites_even:
         item = TutorialItem()
         item["name"] = site.css(".l.square a").xpath("text()").extract()
         relative_url = site.css(".l.square a").xpath("@href").extract()[0]
         item["detailLink"] = urljoin_rfc(base_url, relative_url)
         item["catalog"] = site.css("tr > td:nth-child(2)::text").extract()
         item["workLocation"] = site.css("tr > td:nth-child(4)::text").extract()
         item["recruitNumber"] = site.css("tr > td:nth-child(3)::text").extract()
         item["publishTime"] = site.css("tr > td:nth-child(5)::text").extract()
         items.append(item)
         # print repr(item).decode("unicode-escape") + '\n'
     sites_odd = sel.css("table.tablelist tr.odd")
     for site in sites_odd:
         item = TutorialItem()
         item["name"] = site.css(".l.square a").xpath("text()").extract()
         relative_url = site.css(".l.square a").xpath("@href").extract()[0]
         item["detailLink"] = urljoin_rfc(base_url, relative_url)
         item["catalog"] = site.css("tr > td:nth-child(2)::text").extract()
         item["workLocation"] = site.css("tr > td:nth-child(4)::text").extract()
         item["recruitNumber"] = site.css("tr > td:nth-child(3)::text").extract()
         item["publishTime"] = site.css("tr > td:nth-child(5)::text").extract()
         items.append(item)
     # print repr(item).decode("unicode-escape") + '\n'
     info("parsed " + str(response))
     return items
Example #30
 def parse(self, response):
     sel = Selector(response)
     item = FeatureItem()
     item["title"] = sel.css(".title::text").extract()
     item["link"] = response.url
     item["reviews"] = sel.css(".review-text::text").extract()
     return item
Example #31
 def parse_img_link(self, response):
     selector = Selector(response=response)
     homepage_id = self.homepage_id
     page_urls = filter(lambda x: "/s/" in x,
                        selector.css("a::attr(href)").extract())
     for url in set(page_urls):
         yield scrapy.Request(url=url,
                              cookies=self.cookies,
                              callback=self.parse_img_url)
Example #32
    def parse(self, response):
        selector = Selector(response)

        for post in selector.css('article.post'):
            print('---------------------------------')

            loader = ItemLoader(BlogPost(), post)
            loader.add_css('name', '.entry-title > a::text')
            yield loader.load_item()
Example #33
 def parse_more_topics(self, response):
     logging.warn('FOUND A ADDED TOPICS')
     json_response = json.loads(response.text)
     html_selector = Selector(text=json_response['value']['html'])
     item = response.meta['item']
     item['topics'] = ','.join(
         [item['topics']] +
         list(html_selector.css(self.more_topic_name_css).extract()))
     yield item
Example #34
    def parse_proxy(self, response):
        # url = "https://hidemyna.me/en/proxy-list/?maxtime=1000&type=h&start=0"
        # self.driver.get(url)
        # # time.sleep(20)
        # element = WebDriverWait(self.driver, 30).until(
        # EC.presence_of_element_located((By.CSS_SELECTOR, "div.allcountries__bl")))

        # html = self.driver.page_source
        # sel = Selector(text=html)
        # all_countries = sel.css('div.allcountries__bl label span.flag-icon::attr(class)').extract()
        # self.logger.info(html)
        # print([country.split('-icon-')[-1] for country in all_countries])
        # self.logger.info(all_countries)
        countries = {}
        for i in range(7):
            skip = i * 64
            url = 'https://hidemyna.me/en/proxy-list/?country=UADEARALINBGBRBDCACZUSGBHUIDNLRUESFR&type=h&maxtime=1000&start={}#list'.format(
                skip)
            print(url)
            self.driver.get(url)
            element = WebDriverWait(self.driver, 30).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, "div.allcountries__bl")))

            html = self.driver.page_source
            sel = Selector(text=html)
            for r in sel.css('tbody tr'):
                p = Proxy()
                proxy = r.xpath("td[1]/text()").extract_first()
                port = r.xpath("td[2]/text()").extract_first()
                country = r.xpath("./td[3]").css('div::text').extract_first()
                country_alt_raw = r.xpath("./td[3]").css(
                    'span::attr(class)').extract_first()
                # print(country_alt_raw)
                if country_alt_raw:
                    country_alt = country_alt_raw.split('icon-')[-1]
                    if country_alt in countries:
                        countries[country_alt] += 1
                    else:
                        countries[country_alt] = 0

                else:
                    country_alt = None
                speed = r.css('div.bar p::text').extract_first()
                p_type = r.xpath("./td[5]/text()").extract_first()
                p['proxy'] = proxy
                p['port'] = port
                p['country'] = country
                p['country_alt'] = country_alt
                p['speed'] = speed
                p['protocol'] = p_type
                if p_type and country_alt in self.countries_max:
                    if countries[country_alt] <= self.countries_max[
                            country_alt]:
                        yield p
            time.sleep(10)
        self.logger.info(countries)
Example #35
    def search_xls_link_inpage(self, response):
        sel = Selector(response)
        url_page = sel.css('#main-interno ul li a::attr(href)').extract()

        if len(url_page) == 0:
            return Request(self.all_links.pop(0), callback=self.open_operational_data_label)

        new_url = self.url_base_2 + url_page[0]
        return Request(new_url, callback=self.parse_xls)
Example #36
    def parse(self, response):
        sel = Selector(response)

        item = ProblemItem()
        item['origin_oj'] = 'sdut'
        item['problem_id'] = self.problem_id
        item['problem_url'] = response.url
        item['title'] = sel.xpath('//center/h2/text()').extract()[0]
        item['description'] = sel.css('.pro_desc').extract()[0]
        item['input'] = sel.css('.pro_desc').extract()[1]
        item['output'] = sel.css('.pro_desc').extract()[2]
        item['time_limit'] = sel.xpath('//a/h5/text()').re('T[\S*\s]*s')[0][12:]
        item['memory_limit'] = \
            sel.xpath('//a/h5/text()').re('M[\S*\s]*K')[0][14:]
        item['sample_input'] = sel.xpath('//div[@class="data"]/pre').extract()[0]
        item['sample_output'] = sel.xpath('//div[@class="data"]/pre').extract()[1]
        item['update_time'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        return item
Example #37
    def parse(self, response):
        pageSelector = Selector(response)
        objects = pageSelector.css('.location')
        objects.extract()

        for index, object in enumerate(objects):
            objectUrl = Extractor.url(response, object,
                                      'h2.heading-sm > a::attr(href)')
            yield scrapy.Request(objectUrl, self.parse_object)
Example #38
 def parse_img_url(self, response):
     selector = Selector(response=response)
     image_link = selector.css("#img::attr(src)").extract()[0]
     item = ExhentaiItem()
     filename = image_link.strip().split("/")[-1]
     item['image_urls'] = image_link
     item["image_paths"] = os.path.join(self.title, filename)
     item["image_title"] = self.title
     yield item
Example #39
    def get_xls(self, response):
        """Link to generate XLS is hide in a js script."""
        sel = Selector(response)
        res = sel.css('script').re('"ExportUrlBase":"(.*?)",')
        if res:
            res_str = self.base_url + res[0].replace('\\u0026', '&') + 'Excel'
            return Request(res_str, self.parse_xls)

        return None
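For reference, a standalone sketch of the same regex trick on a made-up script fragment; the page structure, the `ExportUrlBase` value and the `\u0026` escaping shown here are assumptions inferred from the code above:

    from parsel import Selector  # standalone stand-in for the scrapy Selector used above

    # made-up inline-JS fragment that embeds the export link
    html = '<script>var cfg = {"ExportUrlBase":"/Report/Export?id=7\\u0026fmt=","Other":1};</script>'
    sel = Selector(text=html)
    res = sel.css('script').re('"ExportUrlBase":"(.*?)",')
    if res:
        print(res[0].replace('\\u0026', '&') + 'Excel')  # -> /Report/Export?id=7&fmt=Excel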
Example #40
 def parse_item(self, response):
     selector = Selector(response)
     # select the official link among the tabs
     url = selector.xpath(
         '//div[@id="infos"]/p[@class="officiel"]/a/@href').extract()
     if len(url) == 1:  # we found it, follow it
         item = response.meta['item']
         request = Request("http://www.boamp.fr" + url[0],
                           callback=self.parse_official_link)
         request.meta['item'] = item
         return request
     else:  # it does not exist, so try the current page, which is often the one in the official format
         if len(selector.css(".officielOnly").extract()) == 1:
             item = response.meta['item']
             selector = Selector(response)
             html = selector.css("#avisOfficiel").extract()
             references = selector.css("#references").extract()
             return self.extract_data(html, references, response.url, item)
Example #41
 def _past_jobs_processor(node_list):
     if not node_list:
         return
     selector = Selector(text=node_list[0])
     title = selector.css('.title::text').extract()
     company_url = selector.xpath('//a/@href').extract()
     start = selector.xpath('//div[@class="cell date"][1]/text()').extract()
     end = selector.xpath('//div[@class="cell date"][2]/text()').extract()
     return zip(title, company_url, start, end)
Example #42
 def parse_recommended_products(self, response):
     # Scrape similar products
     sel = Selector(response)
     url_paths = sel.css(
         'article.top-products .content>a::attr(href)').extract()
     for url_path in url_paths:
         request = WebdriverRequest(url_path, callback=self.parse_product)
         self.prep_product_tagging(request, response.meta.get('item'))
         yield request
Example #43
    def parse(self, response):
        # path_page_activate = 'div.pagcomment span.active'
        # path_next_page_numb = 'div.pagcomment span.active + a::text'

        path_list_QA = 'li.comment_ask'
        path_comment_id = 'li.comment_ask::attr(id)'

        path_object_id = 'div.wrap_comment::attr(detailid)'

        if response.css(path_object_id).extract_first() is not None:
            self.objectid = response.css(path_object_id).extract_first()

        str_numb_page = 0
        try:
            str_numb_page = response.css('ul.listcomment div.pagcomment span'
                                         )[-2].css('::text').extract_first()
        except IndexError:
            str_numb_page = 1

        for page_numb in range(1, int(str_numb_page) + 1):
            try:
                formdata = {
                    'core[call]': 'cmt.listpaging',
                    'objectid': self.objectid,
                    'objecttype': '6',
                    'pageindex': str(page_numb),
                    'order': '1',
                }
                print("formdata: ")
                print(formdata)
                res_script = requests.post(self.url_api_list_comment,
                                           data=formdata).text
                struct_text = res_script.replace(self.start_replaced_str,
                                                 '').replace(
                                                     self.end_replaced_str, '')
                selector = Selector(text=struct_text)

                for qa in selector.css(path_list_QA):
                    if len(qa.css('div.listreply div.reply')) >= 1:
                        yield {
                            'id_cmt':
                            qa.css(path_comment_id).extract_first(),
                            'question':
                            qa.css('div.question::text').extract_first(),
                            # 'answer': ''.join(qa.css('div.listreply div.reply')[0].css('div.cont::text').extract()),
                            'answers': [
                                ''.join(reply.css('div.cont::text').extract())
                                for reply in qa.css('div.listreply div.reply')
                            ],
                            # 'time': qa.css('li.comment_ask a.time::text').extract_first(),
                            # 'user_name': qa.css('li.comment_ask div.rowuser a strong::text').extract_first(),
                            # 'replier_name': qa.css('li.comment_ask div.rowuser a strong::text').extract_first(),
                        }
                    else:
                        continue
            except Exception as e:
                print(e)
Example #44
def crawl_ips():
    # crawl free IP proxies from xicidaili.com
    headers = {
        "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36"
    }
    for i in range(2354):
        re = requests.get("http://www.xicidaili.com/nn/{0}".format(i),
                          headers=headers)

        # print(re.text)
        selector = Selector(text=re.text)
        # all_trs = selector.css("#ip_list  tr[class]:not([class='subtitle'])")
        all_trs = selector.css("#ip_list tr")

        ip_list = []

        for tr in all_trs[1:]:
            speed_str = tr.css(".bar::attr(title)").extract()[0]
            if speed_str:
                speed = float(speed_str.split("秒")[0])
            # ip = tr.css("td:nth-child[2]::text").extract()[0]  # raises an error
            all_text = tr.css("td::text").extract()
            ip = all_text[0]
            port = all_text[1]
            proxy_type = all_text[5]

            # lis = (ip, port, speed, proxy_type)
            # lis = list(map(lambda a: str(a) if type(a) != 'str' else a, (ip, port, speed, proxy_type)))
            # print(':'.join(lis))

            ip_list.append((ip, port, speed, proxy_type))

            # print(all_trs)
        # for tr in all_trs:
        #     # print(tr.extract())
        #     # ip = tr.xpath('/td[2]/text()').extract()
        #     # port = tr.xpath('/td[3]/text()').extract()
        #     # http_type = tr.xpath('/td[6]/text()').extract()
        #     ip = tr.css('td:nth-child(2)::text').extract()[0]
        #     port = tr.css('td:nth-child(3)::text').extract()[0]
        #     speed = tr.css('td:nth-child(6)::text').extract()[0]
        #     proxy_type = tr.css('td:nth-child(6)::text').extract()[0]
        #     # print(ip, port)
        #     # print(':'.join((str(ip), str(port), str(http_type))))
        #     print(':'.join((ip, port, speed, proxy_type)))
        #     ip_list.append((ip, port, speed, proxy_type))

        print(": ".join(ip_info))

        for ip_info in ip_list:
            cursor.execute(
                "insert into proxy_ip(ip, port, speed, proxy_type) VALUES ('{0}','{1}',{2},'{3}')"
                .format(ip_info[0], ip_info[1], ip_info[2],
                        ip_info[3]))  # 传递字符串一定要加单引号

        conn.commit()
Example #45
    def parse_new_page(self, response):
        #for sel in response.xpath('//ul/li'):
        #   title = sel.xpath('a/text()').extract()
        #  link = sel.xpath('a/@href').extract()
        # desc = sel.xpath('text()').extract()
        #print title, link, desc
        item = ExxNewsItem()
        sel = Selector(response)
        title = sel.css("#blog > div > h2").extract()
        content = sel.css('''#blog > div > div''').extract()

        print title, content

        item["url"] = response.url
        item['title'] = self.process_item(title)
        item['content'] = self.process_item(content)

        yield item
Example #46
    def parse_item(response):
        sel = Selector(response)
        url = response.request.url
        if re.match(r'.*?/\d{4}-\d{2}-\d{2}/.*?html', url):

            print('---------------------')
            print(url)

            content = response.xpath(
                '/html/body/div[1]/div[2]/div[1]/article/div[1]/p//text()'
            ).extract()
            print(content)
            # remove the editor byline
            editor = response.xpath(
                '//*[@class="-articleeditor"]/text()').extract_first()
            if editor:
                content.remove(editor)
            publish_time = sel.re(r'\d{4}-\d{2}-\d{2}.*?\d{2}:\d{2}:\d{2}')[0]
            print(publish_time)
            if ' ' in publish_time:
                publish_time = publish_time.replace(' ', '')

            if content:
                item = NewsItem(
                    domainname='http://ti.tibet3.com/',
                    chinesename='tibet3',
                    url=sel.root.base,
                    title=sel.css('.entry-header > h1:nth-child(1)::text'
                                  ).extract_first(),
                    subtitle=sel.css('.sub::text').extract_first(),
                    language='藏文',
                    encodingtype='utf-8',
                    corpustype='网络',
                    timeofpublish=publish_time,
                    content=''.join(content),
                    author=None)
                print(item.get("title", None))
                print(item.get("timeofpublish", None))
                print(item.get("source", None))
                print(item.get("author", None))
                # yield item
                # item = judge_time_news(item)
                # if item:
                yield item
Example #47
    def parse_download(self, response):
        '''The download page (usually) offers multiple download links, we want just the update.'''

        sel = Selector(response)

        link_notes = None
        link_bios = None

        links = sel.css('a').xpath('@href').extract()
        for link in links:
            ### Release notes are cool too, though they are in PDF form.
            if link.find("ReleaseNotes") >= 0:
                link_notes = link
            if link.find(".BIO") >= 0:
                link_bios = link

        if link_bios is None:
            return

        item = IntelBiosUpdatePageItem()
        link_bios = link_bios[link_bios.find("httpDown=") +
                              len("httpDown="):link_bios.find(".BIO") +
                              len(".BIO")]
        item['bios_url'] = link_bios
        item['notes_url'] = link_notes if link_notes is not None else ""

        ### Supported products is nice too.
        products = []
        products_sel = sel.css('div#prodos')
        if len(products_sel) > 0:
            products_sel = products_sel.xpath(
                ".//table/tr/td/text()").extract()
            for product in products_sel:
                products.append("".join(
                    [c for c in product if c not in ['\t', '\n', '\r']]))
        item['products'] = products
        item['attrs'] = dict(response.meta['attrs'])
        item['item_id'] = item['attrs']['item_id']

        #yield item
        yield Request(url=link_bios,
                      callback=self.parse_binary,
                      meta={"item": item})
        pass
Example #48
def parse_one_page(html):
    sel = Selector(text=html)
    books_lists = sel.css(' div ul li')
    books_lists = books_lists[16:]
    for book in books_lists:
        book_ids = book.xpath("a/@href").extract_first().strip().replace("https://book.douban.com/subject/",'').replace('/', '')
        book_img_url = book.xpath("a[@class='cover']/img/@src").extract_first().strip()
        id_books.append(book_ids)
        url_img_books.append(book_img_url)
    books = sel.css('ul li div')
    book2 = books[:20]
    for book in book2:
        book_title = book.xpath("h2/a/text()").extract_first()
        book_publis = book.xpath("p[@class='color-gray']/text()").extract_first().replace(' ', '').strip()
        book_intro = book.xpath("p[@class='detail']/text()").extract_first().replace(" ", "").strip()

        book_2 = book_publis.split('/')
        bookauthor = book_2[0]
        bookpub = book_2[-2]
        book_date = book_2[-1]

        title_books.append(book_title)
        author_books.append(bookauthor)
        publis_books.append(bookpub)
        date_books.append(book_date)
        intro_books.append(book_intro)

    # print(bookauthor+bookpub+book_date)
    books1 = books[20:]
    for book in books1:
        book_title = book.xpath("h2/a/text()").extract_first()
        book_publis = book.xpath("p[@class='color-gray']/text()").extract_first().replace(' ', '').strip()
        book_2 = book_publis.split('/')

        bookauthor = book_2[0]
        bookpub = book_2[-2]
        book_date = book_2[-1]
        book_intro = book.xpath("p[3]/text()").extract_first().replace(" ", "").strip()

        title_books.append(book_title)
        author_books.append(bookauthor)
        publis_books.append(bookpub)
        date_books.append(book_date)
        intro_books.append(book_intro)
Example #49
class DataclassLoader(ItemLoader, DataclassHelper):
    """
    Using the `ItemLoader` pollutes a dataclass declaration for a scrapy item.
    See: https://docs.scrapy.org/en/latest/topics/loaders.html#working-with-dataclass-items

    This loader lets dataclasses stay pure and frozen. See `items.py`.

    In subclasses, you must override the `dataclass` abstract property and
    define all the methods needed by `populate()`; see that method below.
    """

    default_output_processor = TakeFirst()

    # The `ItemLoader` uses mutable dict under the hood.
    default_item_class = dict

    @property
    def response(self):
        return self.context['response']

    def __call__(self, response):
        self.update(response)
        self.populate()
        return self.load_item()

    def update(self, response):
        self.selector = Selector(response=response)
        self.context.update(selector=self.selector)
        self.context.update(response=response)

    def populate(self):
        """
        For each `self.field_names` calls the `self.<field_name>()` method to
        get the field value and store it internally for loading the item
        further.

        NOTE: using `replace_value()` instead of `add_value()` keeps the first
        item of the internal list is actual. So using the `TakeFirst` as the
        `default_output_processor` works correctly.
        """
        for name in self.field_names:
            self.replace_value(name, getattr(self, name)())

    def load_item(self):
        return self.dataclass(**super().load_item())

    def css_response(self, query):
        """
        Builds a `HtmlResponse` from a HTML text selected with the CSS `query`
        from `self.response`.

        It allows to call a `DataclassLoader` with a nested response similar
        to `ItemLoader.nested_css()`, but without instantiating the current
        class.
        """
        return html(self.selector.css(query).get())
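A minimal usage sketch (not part of the original source): a hypothetical frozen `Book` dataclass and a loader subclass with one method per field, assuming the `DataclassHelper` mixin exposes `dataclass` and `field_names` as described in the docstrings above.

from dataclasses import dataclass


@dataclass(frozen=True)
class Book:
    title: str
    price: str


class BookLoader(DataclassLoader):
    @property
    def dataclass(self):
        # Concrete item type produced by `load_item()`.
        return Book

    def title(self):
        # `self.selector` is set by `update()` before `populate()` runs.
        return self.selector.css('h1::text').get()

    def price(self):
        return self.selector.css('.price::text').get()


# In a spider callback, `__call__` updates the context, populates every field
# and returns a frozen `Book` instance:
# item = BookLoader()(response)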
Пример #50
0
    def parse_dir_name(self, response):
        """TODO: Docstring for parse_dir_name.

        :response: TODO
        :returns: TODO

        """
        sel = Selector(response)
        return sel.css('.forum_dir_info li:last-child a::text').extract(
        )[0].strip()  # format: 40,876
Пример #51
0
    def parse(self, response):
        paragraphs = json.loads(response.body_as_unicode())["aaData"]
        for paragraph, *_ in paragraphs:
            selector = Selector(text=paragraph)
            url = selector.css("p a ::attr(href)").extract_first()

            text = selector.css("p strong ::text")
            is_extra_edition = text.extract_first().startswith("Suplemento")
            date = text.re_first(r"\d{1,2} de \w+ de \d{4}")
            date = parse(date, languages=["pt"]).date()

            yield Gazette(
                date=date,
                file_urls=[url],
                is_extra_edition=is_extra_edition,
                territory_id=self.TERRITORY_ID,
                power="executive_legislature",
                scraped_at=dt.datetime.utcnow(),
            )
Пример #52
0
def img_url_from_page(url):
    html = requests.get(url).text

    sel = Selector(text=html)

    img_names = sel.css('td a img::attr(src)').extract()

    return img_names
Пример #53
0
    def parse(self, response):
        selector = Selector(response)
        base_url = get_base_url(response)
        form_urls = selector.css(
            'div[class^="floor js"] li[class^="dir-item"] a[class="fwb"]::attr(href)'
        ).extract()

        for url in form_urls[:]:
            form_url = clean_url(base_url, url, response.encoding)
            yield Request(url=form_url, callback=self.parse_classify_form)
Пример #54
0
 def parse(self, response):
     print("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa")
     print(response.request.headers['User-Agent'])
     print("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa")
     resp = Selector(text=self.html)
     for row in resp.css('tbody tr[role="row"]'):
         yield {
             'fund_name': row.css('.text-left a::text').get(),
             'NAV': row.css('td:nth-child(10)::text').get()
         }
Пример #55
0
def store_page(url, page):
    sel = Selector(text=page)
    title = sel.css('title ::text').extract_first()
    main_html = sel.css('div[role=main]').extract_first()
    page_dict = {
        'objectID': url,
        'url': url,
        'fileType': 'html',
        'title': title,
        'source': specificGoogleSitesUrl,
        'service': 'gsites',
        'content': main_html,
        'organisationID': organisationID,
        'modified': calendar.timegm(time.gmtime()),  # Not ideal!!
        'created': calendar.timegm(time.gmtime()),  # Definitely not right!!!!
    }
    pp.pprint(page_dict)
    algoliaScrapedIndex.save_object(page_dict)
    return main_html
Пример #56
0
    def parse(self, response):
        sel = Selector(response)

        item = ProblemItem()
        item['origin_oj'] = 'hdu'
        item['problem_id'] = self.problem_id
        item['problem_url'] = response.url
        item['title'] = sel.xpath('//h1/text()').extract()[0]
        item['description'] = sel.css('.panel_content').extract()[0]
        item['input'] = sel.css('.panel_content').extract()[1]
        item['output'] = sel.css('.panel_content').extract()[2]
        item['time_limit'] = \
            sel.xpath('//b/span/text()').re(r'T[\S*\s]*S')[0][12:]
        item['memory_limit'] = \
            sel.xpath('//b/span/text()').re(r'Me[\S*\s]*K')[0][14:]
        item['sample_input'] = sel.xpath('//pre').extract()[0]
        item['sample_output'] = sel.xpath('//pre').extract()[1]
        item['update_time'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        return item
Пример #57
0
 def parse(self, response):
     detail_page_links = []
     for html_text in response.css('record *::text').getall():
         record = Selector(text=html_text)
         url = record.css('a::attr(href)').get()
         UID = url.split('/')[-1][:-5] + '_' + url.split(
             '/')[-4] + url.split('/')[-3] + url.split('/')[-2]
         detail_page_links.append(url)
         yield {
             'UID': UID,
             'title': record.css('a::attr(title)').get(),
             'date': record.css('b::text').get(),
             'FileNumber': None,
             'text length': 0,
             'url': url,
             'crawl state': 'half'
         }
     for url in detail_page_links:
         yield scrapy.Request(url=url, callback=self.parse_content)
Пример #58
0
def get_json(js):
    # Clean up the string: the job description comes back as JSON containing
    # HTML, so it needs some processing.
    if js:
        json_content = js.get('zpData').get('html')
        content = Selector(text=json_content)
        content_text = content.css(".detail-bottom-text::text").re(
            "[\u4e00-\u9fa5_a-zA-Z0-9]+")
        return content_text
    else:
        print("未获取数据")
Пример #59
0
    def parse(self, response):
        # Grab the article URLs from the listing page and hand each one to the
        # detail parser for field extraction.

        post_nodes = response.css(
            "#archive .floated-thumb .post-thumb a").extract()
        for post_node in post_nodes:
            post_node = Selector(text=post_node)
            image_url = post_node.css("img::attr(src)").extract_first("")
            post_url = post_node.css("::attr(href)").extract_first("")
            yield Request(url=parse.urljoin(response.url, post_url),
                          meta={"front_image_url": image_url},
                          callback=self.parse_detail)

        # Grab the next-page URL and hand it to scrapy for downloading.
        next_urls = response.css(
            ".next.page-numbers::attr(href)").extract_first("")
        if next_urls:
            yield Request(url=parse.urljoin(response.url, next_urls),
                          callback=self.parse)
Пример #60
0
    def parse_news_item(self, response):
        sel = Selector(response)
        news = NewsItem()

        news['url'] = response.url
        news['source'] = SOURCE_ID_NIC

        news['title'] = sel.css('.newstitle > h1::text').extract()[0]
        news['content'] = normalize_content(sel.css('#contentText ::text').extract())

        # The article metadata is, for once, nicely split into separate
        # elements, so no regex is needed here.
        news['author'] = '未知'  # i.e. "unknown"
        # A u'\xa0' may show up here and needs to be normalized away.
        news['publisher'] = normalize_content(sel.css('.newstitle > span > b:nth-child(3)::text').extract()[0])

        ctime_str = sel.css('.newstitle > span > b:nth-child(1)::text').extract()[0]
        news['ctime'] = strptime_helper(ctime_str, '%Y/%m/%d')

        return news