Example #1
 def form_part_request(url, callback, part):
     request = Request(
         url=url,
         callback=callback
     )
     request.meta['part'] = part
     return request
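For context, a minimal sketch (assuming a hypothetical spider and endpoint, and that the helper above is available as a module-level function) of how the `part` value attached via `meta` is read back in the callback:

from scrapy import Spider

class PartSpider(Spider):
    # hypothetical spider wiring up the form_part_request helper above
    name = 'part_example'

    def start_requests(self):
        for part in range(3):
            # each request carries its part index in request.meta
            yield form_part_request(
                url='http://example.com/data?part={}'.format(part),
                callback=self.parse_part,
                part=part,
            )

    def parse_part(self, response):
        # the callback reads the value back from response.meta
        part = response.meta['part']
        self.logger.info('received part %s', part)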
Example #2
    def parse(self, response):
        # pagination: pause 25 seconds every 2 pages
        self.pageNo += 1
        if self.pageNo % 2 == 0:
            time.sleep(25)
            
        select = Selector(response)
        if not "shopDetail" in response.meta:
            # 店铺列表页
            allNo = self.questionIdPatten.findall(response.url)
            cityId = allNo[0]  # cityid
            pageNumber = allNo[-1]
            
            # log the current page
            self.fw.write("%s cityId:%s, pageNumber:%s\n" % (response.url, cityId, pageNumber))
            self.fw.flush()
            
            item = DianpingItem()
            item["city_id"] = cityId
            try:
                cityName = select.css(".city").xpath("./text()").extract()[0]
            except Exception as e:
                cityName = ""
                print(e)
                
#             self.fw.write("%s\t%s\n"%(cityId, cityName))
#             self.fw.flush()

            yieldPageFlag = False
            shop_list = select.xpath(".//div[@class='info']")
            for li in shop_list:
                yieldPageFlag = True
                
                item["shop_name"] = li.xpath(".//p[@class='title']/a/text()").extract()[0]
                item["shop_cityname"] = cityName  # 地区
                # domain,当做标签,非区域,抓取区域指地区
                item["shop_domain"] = ",".join(li.xpath(".//p[@class='area-key']/span[@class='area-list']/a/text()").extract())
                key_list = ",".join(li.xpath(".//p[@class='area-key']/span[@class='key-list']/a/text()").extract())
                
                item["shop_tag"] = ",".join([key_list, item["shop_domain"]])  # 标签包含区域
                
                # href = '/shop/123456'
                href = li.xpath(".//p[@class='title']/a[@class='shopname']/@href").extract()[0]
                item["shop_id"] = href.split("/")[-1]
                
                shopUrl = "http://www.dianping.com" + href
                request = Request(shopUrl, callback=self.parse, priority=1234567)  # shop detail request
                request.meta["shopDetail"] = copy.deepcopy(item)
                yield request
                
            
            if yieldPageFlag:
                # if the current page had results, request the next page
                nextPageNumber = int(pageNumber) + 1
                
                url = self.pageUrl % (cityId, nextPageNumber)
                request = Request(url, callback=self.parse, priority=1234)
                yield request
            pass
Example #3
 def parse_hiker_info(self, response):
     # TODO: Somehow obtain the Hiker's direction 'dir'.
     # TODO: Somehow obtain the Hiker's trail start date 'start_date'
     # TODO: Somehow obtain the Hiker's trail estimated end date 'end_date'
     print("Response received: %s" % response)
     print("Parsing Hiker Info from response: %s" % response)
     hiker = HikerItem()
     hiker['id'] = self.extract_hiker_id(response=response)
     hiker_name_xpath = "/html/body/table//tr[4]/td/table/tr//td[2]/table//tr[2]/td//font[2]"
     hiker_name = Selector(response=response).xpath(hiker_name_xpath).extract()[0]
     hiker_name_start = str.find(hiker_name, "-", 0, len(hiker_name))
     hiker_name_end = str.find(hiker_name, "<", hiker_name_start, len(hiker_name))
     hiker_name = hiker_name[hiker_name_start + 1:hiker_name_end]
     hiker_name = str.strip(hiker_name, " ")
     hiker['name'] = hiker_name
     hiker_trail_name_xpath = "/html/body/table//tr[4]/td/table/tr//td[2]/table//tr[2]/td//font[1]/b"
     hiker_trail_name = Selector(response=response).xpath(hiker_trail_name_xpath).extract()[0]
     hiker_trail_name_start = str.find(hiker_trail_name, ">", 0, len(hiker_trail_name))
     hiker_trail_name_end = str.find(hiker_trail_name, "<", hiker_trail_name_start, len(hiker_trail_name))
     hiker_trail_name = hiker_trail_name[hiker_trail_name_start + 1:hiker_trail_name_end]
     hiker['trail_name'] = hiker_trail_name
     hiker['about_url'] = response.url
     # TODO: Verify that the 'journal_url' is the FIRST journal entry.
     hiker['journal_url'] = str.replace(response.url, "about", "entry")
     journal_parse_request = Request(hiker['journal_url'], callback=self.parse_hiker_journal)
     journal_parse_request.meta['hiker'] = hiker
     yield journal_parse_request
Example #4
def test_hs_middlewares_retry(hs_downloader_middleware, hs_spider_middleware):
    spider = Spider('test')
    url = 'http://resp-url'
    request_0 = Request(url)
    response_0 = Response(url)

    hs_downloader_middleware.process_request(request_0, spider)

    assert HS_REQUEST_ID_KEY not in request_0.meta
    assert HS_PARENT_ID_KEY not in request_0.meta
    assert len(hs_spider_middleware._seen_requests) == 0
    assert len(hs_downloader_middleware._seen_requests) == 0

    hs_downloader_middleware.process_response(request_0, response_0, spider)

    assert request_0.meta[HS_REQUEST_ID_KEY] == 0
    assert request_0.meta[HS_PARENT_ID_KEY] is None
    assert hs_spider_middleware._seen_requests[request_0] == 0

    request_1 = request_0.copy()
    response_1 = Response(url)
    assert request_1.meta[HS_REQUEST_ID_KEY] == 0
    assert request_1.meta[HS_PARENT_ID_KEY] is None

    hs_downloader_middleware.process_request(request_1, spider)

    assert HS_REQUEST_ID_KEY not in request_1.meta
    assert request_1.meta[HS_PARENT_ID_KEY] == 0

    hs_downloader_middleware.process_response(request_1, response_1, spider)

    assert request_1.meta[HS_REQUEST_ID_KEY] == 1
    assert request_1.meta[HS_PARENT_ID_KEY] == 0
Example #5
    def parse_data(self, response):

        rows = response.selector.xpath(
            '//*[@id="container-outer"]/div[1]/div[3]/div/div/div[2]/table/tbody/tr')
        for row in rows:

            if row.xpath('td/p'):
                paragraphs = row.xpath('td/p').extract()
                count = 0
                for _ in paragraphs:
                    item = SpiderItem()
                    url_tem = row.xpath('td/p/a/@href').extract()
                    item['url'] = urljoin(response.url, url_tem[count])
                    item['publishdate'] = row.xpath('td/div/a/@title').extract()
                    time_temp = row.xpath('td/p[' + str(count + 1) + ']/text()[2]').extract()
                    item['publishtime'] = process_string(time_temp[0].strip().split('[')[0])
                    item['Source'] = "[House Committee on Appropriations - Subcommittee on Interior and Environment]"
                    item['_type'] = "[Hearings and Markups]"
                    item['ekwhere'] = "[Fed]"
                    link = 'http://docs.house.gov/Committee/Calendar/' + url_tem[count]
                    request = Request(link, callback=self.grab_title)
                    request.meta['item'] = item
                    yield request
                    count = count + 1
                    yield item
Example #6
    def parse(self, response):
        """Fetch the shop detail pages."""
        req = []
        plazaId=response.url.split('/')[-1]
        sel = Selector(response)
        tmplist=['10']
        plazaShop_list=sel.xpath('//*[@class="mod-title"]/a')
        for plazaShops in plazaShop_list:
            link_text = plazaShops.xpath('text()').extract()[0]
            if link_text == '更多店铺':  # "more shops"
                for category in tmplist:
                    if category=='10':
                        shopsurl='http://www.dianping.com'+plazaShops.xpath('@href').extract()[0].replace('20_','10_').strip()
                        shopCatetory1='餐饮'
                    if category=='20':
                        shopsurl='http://www.dianping.com'+plazaShops.xpath('@href').extract()[0].replace('20_','20_').strip()
                        shopCatetory1='购物'
                    shopStreet=plazaShops.xpath('@href').extract()[0].split('/')[-2].replace('20_','').strip()
                    item=PlazaShop()
                    item['plazaId']=plazaId
                    item['shopStreet']=shopStreet
                    item['shopCatetory1']=shopCatetory1
                    item['shopUrl']=shopsurl
                    r = Request(shopsurl, callback=self.shop_next_page)
                    r.meta['item'] = item
                    req.append(r)

        return req
Example #7
    def parse(self, response):
        """Crawl article index pages.

        From the index page, extract each article's topic first, because in
        this old version there is no textual topic information on the article
        page.  On index pages the topic is contained in the alt attribute of
        the article/topic image, but alt is empty on the article page.

        After that, follow the "Read more" link and get the other article
        fields.
        
        """
        for i, a in enumerate(response.xpath(
                "//div[@class='articletrailer']/descendant::a[@class='trailer'][1]/@href")):
            article = Article()

            # If the image is not the default topic image, it will not have
            # an appropriate selector, so we use its div.
            article["category"] = response.xpath(
                    "//div[@class='articleheading']/descendant::img/@alt").extract()[i]

            article_url = response.urljoin(a.extract())
            request = Request(article_url, callback=self.parse_article)
            request.meta["article"] = article

            yield request
Example #8
    def parse_job_list_page(self, response):
        self.get_connector().log(self.name, self.ACTION_CRAWL_LIST, response.url)

        feed_parser = feedparser.parse(response.body)
        for job_entry in feed_parser.entries:
            job_url = job_entry.link
            job_publication_date = datetime.fromtimestamp(mktime(job_entry.published_parsed))

            job_publication_time = mktime(job_publication_date.timetuple())
            last_job_publication_time = mktime(self._last_job_date.timetuple())
            if job_publication_time <= last_job_publication_time:
                self.get_connector().log(self.name,
                                         self.ACTION_MARKER_FOUND,
                                         "%s <= %s" % (job_publication_time, last_job_publication_time))
                return

            prepared_job = JobItem()
            request = Request(job_url, self.parse_job_page)
            request.meta['item'] = prepared_job

            prepared_job['title'] = job_entry.title
            prepared_job['description'] = job_entry.description
            prepared_job['publication_datetime'] = job_publication_date

            yield request
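A minimal sketch (names beyond `parse_job_page` and the meta key are assumptions) of how the callback could complete the prefilled `JobItem` carried in `meta`:

    def parse_job_page(self, response):
        # the partially filled JobItem set by parse_job_list_page travels via meta
        job = response.meta['item']
        # hypothetical field: store the final page URL on the item
        job['url'] = response.url
        yield job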
Example #9
 def parse_index(self, response):
     """处理目录页面,返回指向待爬取网页的Request列表
     """
     conf = response.meta['conf']
     requests = []
     page_list = self._get_result(response,conf)
     # if the index page has no entries, return an empty list
     if not page_list:
         return requests
     next_page = True  # whether the index needs to be paginated
     # check each entry extracted from the index
     for item in page_list:
         if isinstance(item, Request):  # a new Request was returned
             requests.append(item)
             next_page = False
             break
         if item['publish_time']:
             if item['publish_time'] <= self.from_time:  # the page was published before self.from_time
                 next_page = False
                 break
         req = Request(item['crawl_url'], self.parse_page)
         # pass along the information already extracted
         req.meta["item"] = item
         requests.append(req)
     # if pagination is needed, add a Request for the next page; otherwise stop the generator
     if next_page:
         requests.append(Request(self._next_result_page(response), callback=self.parse_index, meta={'conf': conf}))
     return requests
Example #10
    def parse_depute(self, response):
        depute = json.loads(response.body_as_unicode())
        if 'depute' in depute:
            depute = depute['depute']

        depute['photo_url'] = self.photo_url % depute['slug']

        req = None

        for ad in depute['adresses']:
            adresse = ad['adresse']

            pattern = r'Télé(phone|copie)\s*:\s*(\d[0-9 ]+\d)'
            for telm in re.finditer(pattern, adresse):
                if telm.group(1) == 'phone':
                    ad['tel'] = telm.group(2)
                else:
                    ad['fax'] = telm.group(2)

            lad = adresse.lower()
            if not req and not lad.startswith(u'assemblée nationale'):
                trimmed = re.sub(pattern, '', adresse)
                req = Request(url=self.get_geocode_url(adresse),
                              callback=self.parse_geocode)

                req.meta['depute'] = depute
                req.meta['adresse'] = ad

        if req is not None:
            yield req
        else:
            yield depute
Example #11
    def parse(self, response):
        """Fetch the shop detail pages."""
        req = []
        plazaId=response.url.split('/')[-1]
        sel = Selector(response)
        gouwu=sel.xpath('//*[@class="hot-top fn-clear"]/div')
        i=1
        for gouwushop in gouwu:
            shopsurl='http://www.dianping.com'+gouwushop.xpath('a[1]/@href').extract()[0].strip()
            shopImg=[]
            shopImg=gouwushop.xpath('a[1]/img/@src').extract()
            item=PlazaShop()
            item['plazaId']=plazaId
            if i<=4:
                item['shopCatetory1']='购物'
            else:
                item['shopCatetory1']='餐饮'
            item['shopUrl']=shopsurl
            item['image_urls']=shopImg
            r = Request(shopsurl, callback=self.shop_detail)
            r.meta['item'] = item
            i=i+1
            req.append(r)

        return req
Example #12
 def start_requests(self):
     for book in self.mongo.books_collection().find():
         readers_url = u"https://www.livelib.ru/book/{}/readers/read".format(book["lib_id"])
         self.logger.info(u"Queuing page: {}".format(readers_url))
         readers_request = Request(readers_url, callback=self.parse)
         readers_request.meta["book_lib_id"] = book["lib_id"]
         yield readers_request
Example #13
 def start_requests(self):
     kwargs = {
         'debug': self.settings.getbool('GIANT_DEBUG'),
         'limit': self.settings.getint('GIANT_LIMIT'),
         'opt': 'otc'
     }
     requests = []
     for stockid in OtcIdDBHandler().stock.get_ids(**kwargs):
         for mon in range(2, -1, -1):
             timestamp = datetime.utcnow() - relativedelta(months=mon)
             if mon == 0:
                 if timestamp.day == 1 and timestamp.hour <= 14:
                     continue
             URL = (
                 'http://www.gretai.org.tw/ch/stock/aftertrading/' +
                 'daily_trading_info/st43_download.php?d=%(year)d/%(mon)02d&' +
                 'stkno=%(stock)s') % {
                     'year': timestamp.year - 1911,
                     'mon': timestamp.month,
                     'stock': stockid
             }
             request = Request(
                 URL,
                 callback=self.parse,
                 dont_filter=True)
             item = OtcHisStockItem()
             item['stockid'] = stockid
             request.meta['item'] = item
             requests.append(request)
     return requests
Example #14
    def parse(self, response):
        """Fetch the shop detail pages."""
        req = []

        body= response.body.decode('gbk').replace('getCategoryCallback(','')[:-1]
        s = json.loads(body)
        datas=s["data"]
        for data in datas:
            for first_list in data["s"]:
                first=first_list["n"]
                for second_list in first_list["s"]:
                    second=second_list["n"]
                    for third_list in second_list["s"]:
                        third=third_list["n"]
                        if (first.split('|')[1]!='彩票' and first.split('|')[1]!='图书' and first.split('|')[1] !='理财' and second.split('|')[1]!='汽车品牌' and second.split('|')[1]!='汽车车型' and second.split('|')[1]!='京东通信' and third.split('|')[1]!='选号码' and  third.split('|')[1]!='装宽带' and third.split('|')[1]!='中国移动' and third.split('|')[1]!='中国联通' and third.split('|')[1]!='中国电信'):
                            item = JDItem()
                            item['first']=first.split('|')[1]
                            item['second']=second.split('|')[1]
                            item['third']=third.split('|')[1]
                            cat=third.split('|')[0]
                            url='http://list.jd.com/list.html?cat='+cat.replace('-',',')
                            if cat[:4]=='list':
                                url='http://'+cat
                            r = Request(url, callback=self.parse_brand)
                            r.meta['item'] = item
                            print('url-222------------' + url)
                            req.append(r)
        return req
Example #15
    def parse(self, response):
        """Parse a APS record into a HEP record.

        Attempts to parse an XML JATS full text first, if available, and falls
        back to parsing JSON if such is not available.
        """
        aps_response = json.loads(response.body_as_unicode())

        for article in aps_response['data']:
            doi = get_value(article, 'identifiers.doi', default='')

            if doi:
                request = Request(url='{}/{}'.format(self.aps_base_url, doi),
                              headers={'Accept': 'text/xml'},
                              callback=self._parse_jats,
                              errback=self._parse_json_on_failure)
                request.meta['json_article'] = article
                request.meta['original_response'] = response
                yield request

        # Pagination support. Will yield until no more "next" pages are found
        if 'Link' in response.headers:
            links = link_header.parse(response.headers['Link'])
            next_links = links.links_by_attr_pairs([('rel', 'next')])
            if next_links:
                next_url = next_links[0].href
                yield Request(next_url)
Example #16
    def parse_job_list_page(self, response):
        """
        Parsing of the job list.
        """
        self.get_connector().log(self.name, self.ACTION_CRAWL_LIST, response.url)

        try:
            for jobs in self._get_from_list__jobs_lists(response):
                for job in self._get_from_list__jobs(jobs):
                    # first check the url; if the job already exists, skip crawling
                    # (it means the page has already been crawled)
                    try:
                        url = self._get_from_list__url(job)
                    except NotCrawlable:
                        break

                    if self.get_connector().job_exist(url):
                        self.get_connector().log(self.name, self.ACTION_MARKER_FOUND, url)
                        raise StopCrawlJobList()

                    request = Request(url, self.parse_job_page)
                    prefilled_job_item = self._get_prefilled_job_item(job, url)
                    request.meta['item'] = prefilled_job_item

                    if self.is_from_page_enabled():
                        yield request
                    else:
                        yield prefilled_job_item

            next_page_url = self._get_from_list__next_page(response)
            if next_page_url:
                yield Request(url=next_page_url)
        except NotFound as exc:
            self.get_connector().log(self.name, self.ACTION_CRAWL_ERROR, str(exc))
Example #17
 def start_requests(self):
     kwargs = {
         'debug': self.settings.getbool('GIANT_DEBUG'),
         'limit': self.settings.getint('GIANT_LIMIT'),
         'opt': 'twse'
     }
     requests = []
     for stockid in TwseIdDBHandler().stock.get_ids(**kwargs):
         for mon in range(4, -1, -1):
             timestamp = datetime.utcnow() - relativedelta(months=mon)
             if mon == 0:
                 if timestamp.day == 1 and timestamp.hour <= 14:
                     continue
             URL = (
                 'http://www.twse.com.tw/ch/trading/exchange/' +
                 'STOCK_DAY/STOCK_DAY_print.php?genpage=genpage/' +
                 'Report%(year)d%(mon)02d/%(year)d%(mon)02d_F3_1_8_%(stock)s.php' +
                 '&type=csv') % {
                     'year': timestamp.year,
                     'mon': timestamp.month,
                     'stock': stockid
             }
             request = Request(
                 URL,
                 callback=self.parse,
                 dont_filter=True)
             item = TwseHisStockItem()
             item['stockid'] = stockid
             request.meta['item'] = item
             requests.append(request)
     return requests
Example #18
    def parse(self, response):
        """
        response.body is a result of render.html call; it contains HTML processed by a browser.
        here we parse the html
        :param response:
        :return: request to detail page & request to next page if exists
        """
        page = Selector(response)
        divs = page.xpath('//div[@class="list-game-app dotline-btn nofloat"]')
        current_url = response.url

        # parse details
        count = 0
        for div in divs:
            if count >= 2:
                break
            item = AppstoreItem()
            info = div.xpath('.//div[@class="game-info  whole"]')
            detail_url = info.xpath('./h4[@class="title"]/a/@href').extract_first()
            item["url"] = detail_url
            req = Request(detail_url, callback=self.parse_detail_page)
            req.meta["item"] = item
            count += 1
            yield req

        # next page
Example #19
 def parse_zs_home(self, response):
     sel = Selector(response)
     item = self.init()
     con = sel.xpath("//meta[@name='keywords']/@content").extract()
     if con:
         item['company_shortname'] = con[0].split(',')[0]
     else:
         item['company_shortname'] = ''
         
     koubei = sel.xpath("//div[@class='zd_name']/p/a/text()").extract()
     if koubei:
         item['koubei'] = koubei[0]
     else:
         item['koubei'] = ''
     
     company_id = self.rule_getcompanyid.findall(response.url)[-1]
     url = self.url_company_des % company_id 
     item["company_id"] = company_id
     
     item["service_content"] = response.url
     if len(sel.css(".zgshb_menu")) == 0:
         file("a.txt", "a").write(company_id + "\n")
         yield item
     else:
         request = Request(url, callback=self.parse_des)
         request.meta["item"] = copy.deepcopy(item)
         yield request
Example #20
    def parse(self, response):
        for href in response.xpath('//div[contains(@id, "dnn_ctr430_ExbList_pnlList")]//ul//li//a/@href'):
            url = response.urljoin(href.extract())

            request = Request(url, callback=self.parse_exhibition)
            request.meta['dont_redirect'] = True
            yield request
Example #21
    def parse_nextpage(self,response):
        req = []
        sel = Selector(response)
        #print(response.body)
        # next-page address
        next_list = sel.xpath('//*[@class="pull-right "]/text()').extract()#[0].replace('人评价','')
        next_list2 = sel.xpath('//*[@class="pull-right"]/text()').extract()

        if next_list2:
            total=next_list2[0].replace('人评价','')
        if next_list:
            total=next_list[0].replace('人评价','')
        #total=int(next_list)/15+1
        shopid=sel.xpath('/html/head/link[4]/@href').extract()[0].split('/')[-1].split('.')[0]
        print('total reviews: ' + str(total))
        item = response.meta['item']
        tag_list=sel.xpath('//*[@class="tag-category"]/span[1]/text()').extract()
        tag=''
        for tags in tag_list:
            if tag=='':
                tag=tags
            tag=tag+'|'+tags
        for page in range(int(total) // 15 + 1):
            url = 'http://i.meituan.com/deal/'+str(shopid)+'/feedback/page_'+str(page+1)
            print('requesting feedback page: ' + url)
            ua = random.choice(self.user_agent_list)
            if ua:
                r = Request(url, callback=self.parse_comments)
                print('user agent: ' + ua)
                r.headers.setdefault('User-Agent', ua)
                item['url']=url.strip()
                item['tag']=tag.strip()
                r.meta['item'] = item
                req.append(r)
        return req
Example #22
 def parse(self, response):
     select = Selector(response)
     if "data" in response.meta:
         isNextPage = response.meta["data"]
         pageNo = self.digitalPattern.findall(response.url)[1]
     else:
         isNextPage = "firstPage"
         pageNo = "0"
     
     
     question_id = self.questionIdPatten.findall(response.url)[0]
     question_id = question_id[1:].replace("-", "")
     
     item = TobatoItem()
     item["question_id"] = question_id
     
      # .pages: fetch answer pages starting from page 2
      try:
          logging.info("fetching answer pages, starting from page 2")
          total_answers = select.css(".pages").xpath(".//em/b/text()").extract()[0]
          pages = int(total_answers) // 20 + 1  # paging starts from page 2
         
          for page in range(2, pages + 1):  # range excludes the upper bound
             requestUrl = self.pageUrl % (question_id, page)
             logging.info(requestUrl + "---------------------------------------")
             request = Request(url=requestUrl, callback=self.parse, priority=123456)
             request.meta["data"] = "true"
             yield request
      except Exception as e:
          print(e)
Example #23
    def parse_booklink(self, response):
        sel = Selector(response)
        

        # Xpath chooses the content of the first <div> </div> with class="p-name"
        sites = sel.xpath('(//div[@class="p-name"])[1]')
        req = []
        
        for site in sites:

            # This is the hyperlink to the details of the bookinfo.
            # Xpath chooses the @href content (hyperlink) in <a> </a>
            books = site.xpath('a/@href').extract()

            for b in books:
                # Request the page at this url; it shows the book's details, including category info.
                # Encode to keep Chinese characters from being lost.
                url = "http:" + b.encode('utf-8')
 
                # Store the URL in the 'request' method, callback function is parse()
                r = Request(url, callback=self.parse_category, dont_filter=True)

                # Bookid is stored as additional data in 'request'.
                r.meta['bkid']=response.meta['id']
                req.append(r)
        return req
Example #24
    def parse(self, response):
        """
        """
        sel = Selector(response)
        sites = sel.xpath("//div[@class='tabs-container']//*//article//div[@class='description']")
        domain = '{uri.scheme}://{uri.netloc}'.format(uri=urlparse(response.url))
        rub = u'\u0440\u0443\u0431.'
        items = []
        for site in sites:
            item = RealtyItem()
            price = site.xpath(".//section[@class='d-1']//p[@class='price']//span/text()").extract()[0]
            price = price.replace(rub, '').replace(u' ', '')
            item['price'] = price
            item['floor'] = site.xpath(".//section[@class='d-2 params']//p[@class='row floor']//span[@class='value corporate_red']/text()").extract()[0]
            item['space'] = site.xpath(".//section[@class='d-2 params']//p[@class='row space']//span[@class='value corporate_red']/text()").extract()[0]
            item['url'] = urljoin(domain, site.xpath(".//p[@class='title-obj']/a/@href").extract()[0])
            kitchen = site.xpath(".//section[@class='d-2 params']//p[@class='row kitchen']//span[@class='value corporate_red']/text()").extract()
            if kitchen:
                item['kitchen'] = kitchen[0]
                # item['district'] = request.meta['item']
            request = Request(item['url'], callback=self.parse_page)
            request.meta['item'] = item
            yield request

            items.append(item)
Example #25
 def start_requests(self):
     for rating in self.mongo.ratings_collection().find():
         if self.mongo.users_collection().find({"user_lib_id": rating["user_lib_id"]}).count() == 0:
             user_url = u"https://www.livelib.ru/reader/{0}".format(rating["user_lib_id"])
             self.logger.info(u"Queuing page: {}".format(user_url))
             user_request = Request(user_url, callback=self.parse)
             user_request.meta["user_lib_id"] = rating["user_lib_id"]
             yield user_request
Example #26
 def parse_article(self, response): 
     """Parses the page, finds the comment page link, goes there"""
     comment_page = response.xpath("//a[@class='k_makeComment']/@href")
     if len(comment_page) > 0: 
         article_link = response.url 
         comment_page = "http://www.blic.rs" + comment_page.extract()[0]
         request = Request(comment_page, callback=self.parse_comment_page) 
         request.meta['article_link'] = article_link
         yield request
Example #27
 def parse_node(self, response, node):
   item = MynewsItem()
   item['host'] = 'www.themalaysianinsider.com'
   item['url'] = node.xpath('link/text()').extract()[0].strip()
   item['title'] = node.xpath('title/text()').extract()[0].strip()
   item['category'] = 'news'
   request = Request(item['url'] , callback=self.parse_content)
   request.meta['item'] = item
   yield request
Example #28
 def parse_node(self, response, node):
   item = MynewsItem()
   item['host'] = 'www.thestar.com.my'
   item['url'] = node.xpath('link/text()').extract()[0].strip()
   item['title'] = node.xpath('title/text()').extract()[0].strip()
   item['date'] = node.xpath('pubDate/text()').extract()[0].strip()
   request =  Request(item['url'] , callback=self.parse_content)
   request.meta['item'] = item
   return request
Example #29
 def parse(self, response):
     list_of_cities = response.xpath(
         '//h5[text()="us cities"]/parent::li/ul//a[@href][text()!="more ..."]/@href'
     ).extract()
     # print list_of_cities
     for l in list_of_cities:
         r = Request(l + "search/msa", callback=self.parse_city)
         r.meta["base_url"] = l[:-1]
         yield r
Example #30
 def parseProduct(self,response):
     
     item = response.meta["data"]
     sel = Selector(response)
      # product id
      item['product_id'] = self.rule_getpid.findall(response.url)[0]
      # product name, with whitespace stripped
     product_name = sel.xpath(".//div[@class='tb-detail-hd']/h1/text()").extract()[0]
     item['product_name'] = self.rule_removeblank.sub('',product_name)
     #brandid
     item['brand_id'] = response.headers['at_brid']
     #userid==shopid
     content = sel.xpath(".//head/meta[@name='microscope-data']/@content").extract()
     if content:
         word = self.rule_getuserid.findall(content[0])
         if word:
             item['shop_id'] = word[0].split('=')[-1]
         else:
             item['shop_id'] = ''
     else:
         item['shop_id'] = ''
     #product_point    
     product_point = sel.xpath(".//div[@class='tb-detail-hd']/p/text()").extract()[0]
     item['product_point'] = self.rule_removeblank.sub('',product_point)
     
     #product_data
     product_data_temp = sel.xpath(".//ul[@id = 'J_AttrUL']/li/text()").extract()
     product_data =[]
     item['product_type'] = ''
     item['brand_name'] = ''
     for pd in product_data_temp:
         pd_temp = pd.replace(u'\xa0','')
         if self.rule_brandname.findall(pd_temp):
             item['brand_name'] = pd_temp.split(':')[-1]
         elif self.rule_producttype.findall(pd_temp):
             item['product_type'] = pd_temp.split(':')[-1]
         product_data.append(pd_temp)
     item['product_data'] = product_data
     
     item['category_id'] = ''
     item['category_name'] = ''
     item['product_judgementnum'] = ''
     item['product_searchword'] = ''
     item['product_specialjudge'] = ''
     item['url'] = response.url
     
     #group_img
     group_imgs = sel.xpath("//ul[@id='J_UlThumb']/li/a/img/@src").extract()
     for img in group_imgs:
         item['image_urls'].append('https:'+ img.replace('60x60','300x300'))
     
     request =  Request(self.url_specialjudge%(item["product_id"]),
                        callback = self.parse_specialjudge, priority=123456
                       )
     request.meta["data"] = copy.deepcopy(item)
     
     yield request
Example #31
 def start_requests(self):
     yield Request(self.basic_url.format(offset = 0),self.hn_parse)
Example #32
 def get_media_requests(self, item, info):
     yield Request(item['url'])
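The `get_media_requests` above belongs to a media pipeline; a minimal sketch of how such a pipeline might look (the class name and item field names are assumptions, not the original code):

from scrapy import Request
from scrapy.pipelines.files import FilesPipeline

class SingleUrlFilesPipeline(FilesPipeline):
    # hypothetical pipeline: download the single URL stored in item['url']
    def get_media_requests(self, item, info):
        yield Request(item['url'])

    def item_completed(self, results, item, info):
        # keep only the paths of files that downloaded successfully
        item['file_paths'] = [x['path'] for ok, x in results if ok]
        return item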
Example #33
 def start_requests(self):
     yield Request("http://www.tudogostoso.com.br/", callback=self.initial_url, headers=self.headers)
Example #34
 def categories_urls(self, response):
     all_recipes_link = response.xpath("//div[contains(@class, 'submenu')]/ul/li/a/@href")[0].extract()
     yield Request(response.urljoin(all_recipes_link), callback=self.all_recipes_urls, headers=self.headers)
Example #35
 def start_requests(self):
      # campus network URL to request
     url = "http://202.207.247.49"
     yield Request(url, self.login_parse)
Example #36
 def test_callback_serialization(self):
     r = Request("http://www.example.com",
                 callback=self.spider.parse_item,
                 errback=self.spider.handle_error)
     self._assert_serializes_ok(r, spider=self.spider)
Example #37
    def parse_list(self, response):
        data = response.body
        if data == '' or data == '[]':
            log.msg(format='%(request)s post fail.response is [].',
                    level=log.ERROR,
                    request=response.url)
            return
        age = response.meta['age']
        cat = response.meta['cat']
        try:
            js = json.loads(data)
        except:
            log.msg(u'Book category [%s]-[%s]: list response could not be parsed as JSON. url=%s' %
                    (age['name'], cat['name'], response.url),
                    level=log.INFO)
            return
        #
        log.msg(u'Book category [%s]-[%s], page [%d], %d products; requesting details...' %
                (age['name'], cat['name'], response.meta['page'],
                 len(js['products'])))
        for item in js['products']:
            #
            '''
            pc = Category()
            pc['product_id'] = item['id']
            pc['category_path'] = '01.41.%s.%s.00.00' % (age['id'], cat['id'])
            pc['path_name'] = cat['name']
            yield pc

            # detail request
            yield Request(
                url=self.info_url.replace('<?pid?>', item['id']),
                callback=self.parse_info,
                headers=self.headers,
                meta={'age': age, 'cat': cat}
            )
            '''
            # review request
            yield Request(url=self.review_url.replace('<?pid?>',
                                                      item['id']).replace(
                                                          '<?page?>', '1'),
                          callback=self.parse_review,
                          headers=self.headers,
                          meta={
                              'page': 1,
                              'pid': item['id']
                          })

        # next page
        if len(js['products']) >= 200:
            page = response.meta['page'] + 1
            log.msg(u'Requesting category [%s]-[%s], page %d' % (age['name'], cat['name'], page))
            yield Request(url=self.list_url.replace(
                '<?page?>',
                str(page)).replace('<?age?>',
                                   age['id']).replace('<?cat?>', cat['id']),
                          callback=self.parse_list,
                          headers=self.headers,
                          meta={
                              'page': page,
                              'age': age,
                              'cat': cat
                          })
Example #38
 def parse(self, response):
     self.logger.debug('response.url: {}'.format(response.url))
     try:
         total = response.xpath(
             '//div[@class="category-items clearfix"]//div[@class="category-item m"]'
         )
         # le = LinkExtractor(
         #     restrict_xpaths='//div[@class="category-item m"]/div[@class="mc"]/div[@class="items"]/dl/dd/a')
         # links = le.extract_links(response)
         for t in total:
             first_category_name = t.xpath(
                 './div[@class="mt"]/h2/span/text()').extract_first()
             second = t.xpath('./div[@class="mc"]/div[@class="items"]/dl')
             for s in second:
                 second_links = s.xpath('./dt/a').extract_first()
                 second_item = re.findall(
                     r'<a href="(.*?)" target="_blank">(.*?)</a>',
                     second_links)
                 self.logger.debug('second_item: {} '.format(second_item))
                 second_category_name = second_item[0][1]
                 second_category_url = 'https:' + second_item[0][0]
                 third_links = s.xpath('./dd/a').extract()
                 for third_link in third_links:
                     third_items = re.findall(
                         r'<a href="(.*?)" target="_blank">(.*?)</a>',
                         third_link)
                     self.logger.debug(
                         'third_items: {} '.format(third_items))
                      for item in third_items:
                          # re.findall returns tuples; copy to a list so entries can be modified
                          item = list(item)
                          if item[0].startswith('https:'):
                              item[0] = item[0][len('https:'):]
                         if item[0].split('.')[0].split('//')[1] != 'list':
                             self.logger.debug('not list url: {}'.format(
                                 item[0]))
                             yield Request('https:' + item[0],
                                           callback=self.parse_not_list)
                         else:
                             category_item = CategoryItem()
                             category_item[
                                 'first_category_name'] = first_category_name
                             category_item[
                                 'second_category_name'] = second_category_name
                             category_item[
                                 'second_category_url'] = second_category_url
                             category_item['third_category_name'] = item[1]
                             category_item[
                                 'third_category_url'] = 'https:' + item[0]
                             category_item['id'] = item[0].split(
                                 '=')[1].split('&')[0]
                             category_item['crawl_time'] = datetime.now(
                             ).strftime("%Y-%m-%d %H:%M:%S")
                             yield category_item
                             category_info = {
                                 'first_category_name':
                                 first_category_name,
                                 'second_category_name':
                                 second_category_name,
                                 'third_category_name':
                                 item[1],
                                 'third_category_id':
                                 item[0].split('=')[1].split('&')[0],
                             }
                             yield Request(
                                 'https:' + item[0],
                                 callback=self.parse_list,
                                 meta={'category_info': category_info})
     except Exception as e:
          self.logger.debug('parse error: %s', e)
Example #39
    def parse_product(self, response):
        """商品页获取title,price,product_id"""
        category_info = response.meta.get('category_info')
        ids = re.findall(r"venderId:(.*?),\s.*?shopId:'(.*?)'", response.text)
        if not ids:
            ids = re.findall(r"venderId:(.*?),\s.*?shopId:(.*?),",
                             response.text)
        vender_id = ids[0][0]
        shop_id = ids[0][1]

        # shop
        shopItem = ShopItem()
        shopItem['id'] = shop_id
        shopItem['venderId'] = vender_id
        shopItem['url1'] = 'http://mall.jd.com/index-%s.html' % (shop_id)
        try:
            shopItem['url2'] = 'https:' + \
                               response.xpath('//ul[@class="parameter2 p-parameter-list"]/li/a/@href').extract_first()
        except:
            shopItem['url2'] = shopItem['url1']

        # name = ''
        if shop_id == '0':
            shopItem['name'] = '京东自营'
        else:
            if response.xpath(
                    '//ul[@class="parameter2 p-parameter-list"]/li/a//text()'
            ).extract_first():
                shopItem['name'] = response.xpath(
                    '//ul[@class="parameter2 p-parameter-list"]/li/a//text()'
                ).extract_first()
                self.logger.debug('name1: {}'.format(shopItem['name']))
            elif response.xpath(
                    '//span[@class="shop-name"]//text()').extract_first():
                shopItem['name'] = response.xpath(
                    '//span[@class="shop-name"]//text()').extract_first(
                    ).strip()
                self.logger.debug('name2: {}'.format(shopItem['name']))
            elif response.xpath(
                    '//div[@class="name"]/a//text()').extract_first():
                self.logger.debug('name3 div[@class="name"]/a: {}'.format(
                    response.xpath(
                        '//div[@class="name"]/a//text()').extract_first()))
                shopItem['name'] = response.xpath(
                    '//div[@class="name"]/a//text()').extract_first().strip()
                self.logger.debug('name3: {}'.format(shopItem['name']))
            elif response.xpath(
                    '//div[@class="shopName"]/strong/span/a//text()'
            ).extract_first():
                shopItem['name'] = response.xpath(
                    '//div[@class="shopName"]/strong/span/a//text()'
                ).extract_first().strip()
                self.logger.debug('name4: {}'.format(shopItem['name']))
            elif response.xpath(
                    '//div[@class="seller-infor"]/a//text()').extract_first():
                shopItem['name'] = response.xpath(
                    '//div[@class="seller-infor"]/a//text()').extract_first(
                    ).strip()
                self.logger.debug('name6: {}'.format(shopItem['name']))
            else:
                shopItem['name'] = '京东自营'
                self.logger.debug('name7: {}'.format(shopItem['name']))
        shopItem['crawl_time'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        yield shopItem

        productsItem = ProductsItem()
        productsItem['shopId'] = shop_id
        productsItem['first_category_name'] = category_info.get(
            'first_category_name')
        productsItem['second_category_name'] = category_info.get(
            'second_category_name')
        productsItem['third_category_name'] = category_info.get(
            'third_category_name')
        productsItem['third_category_id'] = category_info.get(
            'third_category_id')
        # try:
        #     # title = response.xpath('//div[@class="sku-name"]/text()').extract()[0].replace(u"\xa0", "").strip()
        #     title = ''.join(response.xpath('//div[@class="sku-name"]//text()').extract())
        #     self.logger.debug('title1: {}'.format(title))
        # except Exception as e:
        #     title = response.xpath('//div[@id="name"]/h1/text()').extract_first()
        if response.xpath('//div[@class="sku-name"]/text()').extract():
            # title = ''.join(i.split() for i in response.xpath('//div[@class="sku-name"]//text()').extract())
            # title = ''.join(response.xpath('//div[@class="sku-name"]/text()').extract_first().split())
            title = ''.join(i.strip() for i in response.xpath(
                '//div[@class="sku-name"]/text()').extract())
            self.logger.debug('title1: {}'.format(title))
        elif response.xpath('//div[@id="name"]/h1/text()').extract_first():
            title = response.xpath(
                '//div[@id="name"]/h1/text()').extract_first()
            self.logger.debug('title2: {}'.format(title))
        else:
            title = response.xpath(
                '//ul[@class="parameter2 p-parameter-list"]/li[1]/@title'
            ).extract_first()
            self.logger.debug('title3: {}'.format(title))
        productsItem['name'] = title.strip()
        product_id = response.url.split('/')[-1][:-5]
        productsItem['id'] = product_id
        productsItem['url'] = response.url

        # description
        desc = response.xpath(
            '//ul[@class="parameter2 p-parameter-list"]//text()').extract()
        productsItem['description'] = '/'.join(i.strip() for i in desc)
        # productsItem['description'] = '/'.join(desc)

        # price
        # response = requests.get(url=price_url + product_id)
        # price_response = html_from_uri(self.price_url.format(product_id=product_id))
        total_price_url = self.price_url.format(
            product_id=product_id) + '&pduid=' + str(
                random.randint(100000, 999999))
        self.logger.debug('total_price_url: {}'.format(total_price_url))
        price_response = requests.get(total_price_url)
        price_json = price_response.json()
        self.logger.debug('price_json:{}'.format(price_json))
        productsItem['reallyPrice'] = price_json[0]['p']
        productsItem['originalPrice'] = price_json[0]['m']

        # coupons / promotions
        # res_url = self.favourable_url % (product_id, shop_id, vender_id, category.replace(',', '%2c'))
        res_url = self.favourable_url.format(
            skuId=product_id,
            shopId=shop_id,
            venderId=vender_id,
            cat=category_info.get('third_category_id').replace(',', '%2c'))
        # print(res_url)
        # response = requests.get(res_url)
        # fav_response = html_from_uri(res_url)
        fav_response = requests.get(res_url)
        fav_data = fav_response.json()
        self.logger.debug('fav_data:{}'.format(fav_data))
        if fav_data['skuCoupon']:
            desc1 = []
            for item in fav_data['skuCoupon']:
                start_time = item['beginTime']
                end_time = item['endTime']
                time_dec = item['timeDesc']
                fav_price = item['quota']
                fav_count = item['discount']
                fav_time = item['addDays']
                desc1.append(u'有效期%s至%s,满%s减%s' %
                             (start_time, end_time, fav_price, fav_count))
            productsItem['favourableDesc1'] = ';'.join(desc1)

        if fav_data['prom'] and fav_data['prom']['pickOneTag']:
            desc2 = []
            for item in fav_data['prom']['pickOneTag']:
                desc2.append(item['content'])
            productsItem['favourableDesc2'] = ';'.join(desc2)

        productsItem['crawl_time'] = datetime.now().strftime(
            "%Y-%m-%d %H:%M:%S")
        yield productsItem

        data = dict()
        data['product_id'] = product_id
        data['page'] = 0
        yield Request(url=self.comment_url.format(productId=product_id,
                                                  page=0),
                      callback=self.parse_comments,
                      meta=data)
Example #40
 def start_requests(self):
     for base_url in self.base_urls:
         yield Request(url=base_url, callback=self.parse)
Example #41
    def parseStockInfo(self, response):
        stock_sets = set()

        # main content
        main_content = response.css(".main > div.content")
        # print(main_content)

        # market index code
        index_code = response.meta['index_code']
        # print(index_code)

        stockInfoList = main_content.css(".block02 > div.tab01").xpath(
            "//table/tr[position() > 1 and position() < 52]")
        # print("股票列表:\n")
        for stockInfo in stockInfoList:
            if (len(stockInfo.xpath("td").extract()) < 3):
                continue

            # stock code
            stock_code = stockInfo.xpath("td[1]/a/text()").extract()
            # stock name
            stock_name = stockInfo.xpath("td[2]/a/text()").extract()
            # latest trade price
            last_trade = stockInfo.xpath("td[3]/span/text()").extract()
            # creation time
            create_time = CommonUtil().getCreateTime()

            # debug output
            print(stock_code, stock_name, index_code, last_trade, create_time)

            # item with the stock's basic information
            stock_info_item = StockInfoItem()
            stock_info_item['stock_code'] = stock_code
            stock_info_item['stock_name'] = stock_name
            stock_info_item['index_code'] = index_code
            stock_info_item['last_trade'] = last_trade
            stock_info_item['create_time'] = create_time

            stock_sets.add(stock_info_item)
            yield stock_info_item

        # loop over the stock list and fetch each stock's shareholder/equity info
        for stock_info in stock_sets:
            stock_code = stock_info['stock_code'][0]
            stock_name = stock_info['stock_name'][0]

            # url for the stock's shareholder/equity info
            holder_url = self.holder_url % stock_code
            print("====== fetching shareholder info, holder_url: " + holder_url)
            yield Request(url=holder_url,
                          meta={
                              'stock_code': stock_code,
                              'stock_name': stock_name
                          },
                          callback=self.parsestockShareHolder)

        # loop over the stock list and fetch dividend info
        for stock_info in stock_sets:
            stock_code = stock_info['stock_code'][0]
            stock_name = stock_info['stock_name'][0]

            # dividend / bonus-share record info
            dividend_url = self.dividend_url % stock_code
            print("====== fetching dividend records, dividend_url: " + dividend_url)
            yield Request(url=dividend_url,
                          meta={
                              'stock_code': stock_code,
                              'stock_name': stock_name
                          },
                          callback=self.parseStockDividendRecord)

        # loop over the stock list and fetch sector (tag) info
        for stock_info in stock_sets:
            stock_code = stock_info['stock_code'][0]
            stock_name = stock_info['stock_name'][0]

            # sector info for the stock
            stock_type_url = self.stock_type_url % stock_code
            print("====== fetching sector info, stock_type_url: " + stock_type_url)
            yield Request(url=stock_type_url,
                          meta={
                              'stock_code': stock_code,
                              'stock_name': stock_name
                          },
                          callback=self.parseStockTypeData)

        # loop over the stock list and fetch historical quote data
        for stock_info in stock_sets:
            stock_code = stock_info['stock_code'][0]
            stock_name = stock_info['stock_name'][0]

            # loop over the years 1990 to 2017
            for year in range(1990, 2018):
                # four quarters per year
                for quarter in range(1, 5):
                    data_url = self.detail_url % (stock_code, year, quarter)
                    print("====== fetching historical quotes, data_url: " + data_url)
                    yield Request(url=data_url,
                                  meta={
                                      'stock_code': stock_code,
                                      'stock_name': stock_name,
                                      'year': year
                                  },
                                  callback=self.parseHistoryStockData)
Example #42
 def start_requests(self):
     yield Request(url=self.start_urls[0],
                   callback=self.parse,
                   headers=self.headers)
Example #43
 def start_requests(self):
     keys = ['大数据', 'hadoop', 'spark']
     for key in keys:
         url = 'http://www.neitui.me/?name=job&handle=lists&city=城市&keyword=' + key
         yield Request(url=url)
Example #44
 def start_requests(self):
     yield Request('http://ssdut.dlut.edu.cn/index/bkstz.htm', self.parse)
Example #45
 def start_requests(self):
     url = 'https://movie.douban.com/top250'
     yield Request(url, headers=self.headers)
Example #46
 def test_utf8_body(self):
     r = Request("http://www.example.com", body=b"\xc2\xa3")
     self._assert_serializes_ok(r)
Example #47
 def start_requests(self):
     yield Request(url=u"http://airquality.deq.louisiana.gov",
                   callback=self.get_global_date)
Example #48
 def test_unserializable_callback2(self):
     r = Request("http://www.example.com", callback=self.spider.parse_item)
     self.assertRaises(ValueError, r.to_dict, spider=None)
Example #49
 def start_requests(self):
     url = "https://hoanghamobile.com/dien-thoai-di-dong-c14.html?sort=0&p="
     page = ["1","2","3","4","5","6","7","8","9","10"]
     for x in page:
         yield Request(url+x, self.parse)
Example #50
 def start_requests(self):
     yield Request(ELECT_URL + 'speltyCommonCourse.aspx',
                   dont_filter=True,
                   callback=self.tongshi_1)
Example #51
 def test_basic(self):
     r = Request("http://www.example.com")
     self._assert_serializes_ok(r)
Example #52
 def start_requests(self):
     for book in book_list:
         yield Request(url=BASE_URL.format(book),
                       callback=self.parse_book_info,
                       cb_kwargs=dict(short_name=book))
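For context, entries in `cb_kwargs` are passed to the callback as extra keyword arguments; a minimal sketch (assuming only the callback name used in the example above):

 def parse_book_info(self, response, short_name):
     # short_name arrives as a keyword argument supplied via cb_kwargs
     self.logger.info('parsed book %s from %s', short_name, response.url)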
Example #53
 def test_unserializable_callback1(self):
     r = Request("http://www.example.com", callback=lambda x: x)
     self.assertRaises(ValueError, r.to_dict, spider=self.spider)
Example #54
 def initial_url(self, response):
     for href in response.xpath("//div[contains(@class, 'menu')]/nav/ul/li/a/@href"):
         href_text = href.extract()
         if href_text not in ["/videos.php", "/especiais/15-chefs-tudo-gostoso", "/categorias/sopas.php"]:
             yield Request(response.urljoin(href.extract()), callback=self.categories_urls, headers=self.headers)
Example #55
 def test_mixin_private_callback_serialization(self):
     r = Request("http://www.example.com",
                 callback=self.spider._TestSpiderMixin__mixin_callback,
                 errback=self.spider.handle_error)
     self._assert_serializes_ok(r, spider=self.spider)
Example #56
 def parse(self, response):
     url = "https://hoanghamobile.com"
     url_phones = response.css('a.mosaic-overlay::attr(href)').getall()
     for url_phone in url_phones:
         yield Request(url+url_phone,self.save_info)
Example #57
 def start_requests(self):
     for uid in self.start_users:
         yield Request(self.user_url.format(uid=uid),
                       callback=self.parse_user)
Example #58
 def parse(self, response: Response):
     for url in response.css('.sub-catalog a::attr("href")').extract():
         yield Request('https://www.dushu.com' + url, callback=self.parse_item)
Example #59
 def start_requests(self):
     # First request
     yield Request(
         url=u'FILL_WITH_URL',
         callback=self.parse,
     )
Example #60
 def start_requests(self):
     yield Request(url=self.start_url, callback=self.parse, dont_filter=True)