Example #1
 def test_url_query_parameter_2(self):
     """
     This problem was seen several times in the feeds. Sometimes affiliate URLs contain
     a nested, encoded affiliate URL with the direct URL as a parameter. For example:
     aff_url1 = 'http://www.tkqlhce.com/click-2590032-10294381?url=http%3A%2F%2Fwww.argos.co.uk%2Fwebapp%2Fwcs%2Fstores%2Fservlet%2FArgosCreateReferral%3FstoreId%3D10001%26langId%3D-1%26referrer%3DCOJUN%26params%3Dadref%253DGarden+and+DIY-%3EGarden+furniture-%3EChildren%26%2339%3Bs+garden+furniture%26referredURL%3Dhttp%3A%2F%2Fwww.argos.co.uk%2Fwebapp%2Fwcs%2Fstores%2Fservlet%2FProductDisplay%253FstoreId%253D10001%2526catalogId%253D1500001501%2526productId%253D1500357023%2526langId%253D-1'
     the typical code to extract needed URL from it is:
     aff_url2 = url_query_parameter(aff_url1, 'url')
     after this aff_url2 is:
     'http://www.argos.co.uk/webapp/wcs/stores/servlet/ArgosCreateReferral?storeId=10001&langId=-1&referrer=COJUN&params=adref%3DGarden and DIY->Garden furniture->Children&#39;s gardenfurniture&referredURL=http://www.argos.co.uk/webapp/wcs/stores/servlet/ProductDisplay%3FstoreId%3D10001%26catalogId%3D1500001501%26productId%3D1500357023%26langId%3D-1'
     the direct URL extraction is
     url = url_query_parameter(aff_url2, 'referredURL')
     but this will not work, because aff_url2 contains &#39; (an apostrophe, HTML-encoded
     in the feed): the '#' inside the entity is parsed as a URL fragment, so the query string
     is cut short and the extraction fails. The current workaround, applied in the spider,
     is simply to replace &#39; with %27
     """
     return # FIXME: this test should pass but currently doesn't
     # correct case
     aff_url1 = "http://www.anrdoezrs.net/click-2590032-10294381?url=http%3A%2F%2Fwww.argos.co.uk%2Fwebapp%2Fwcs%2Fstores%2Fservlet%2FArgosCreateReferral%3FstoreId%3D10001%26langId%3D-1%26referrer%3DCOJUN%26params%3Dadref%253DGarden+and+DIY-%3EGarden+furniture-%3EGarden+table+and+chair+sets%26referredURL%3Dhttp%3A%2F%2Fwww.argos.co.uk%2Fwebapp%2Fwcs%2Fstores%2Fservlet%2FProductDisplay%253FstoreId%253D10001%2526catalogId%253D1500001501%2526productId%253D1500357199%2526langId%253D-1"
     aff_url2 = url_query_parameter(aff_url1, 'url')
     self.assertEqual(aff_url2, "http://www.argos.co.uk/webapp/wcs/stores/servlet/ArgosCreateReferral?storeId=10001&langId=-1&referrer=COJUN&params=adref%3DGarden and DIY->Garden furniture->Garden table and chair sets&referredURL=http://www.argos.co.uk/webapp/wcs/stores/servlet/ProductDisplay%3FstoreId%3D10001%26catalogId%3D1500001501%26productId%3D1500357199%26langId%3D-1")
     prod_url = url_query_parameter(aff_url2, 'referredURL')
     self.assertEqual(prod_url, "http://www.argos.co.uk/webapp/wcs/stores/servlet/ProductDisplay?storeId=10001&catalogId=1500001501&productId=1500357199&langId=-1")
     # weird case
     aff_url1 = "http://www.tkqlhce.com/click-2590032-10294381?url=http%3A%2F%2Fwww.argos.co.uk%2Fwebapp%2Fwcs%2Fstores%2Fservlet%2FArgosCreateReferral%3FstoreId%3D10001%26langId%3D-1%26referrer%3DCOJUN%26params%3Dadref%253DGarden+and+DIY-%3EGarden+furniture-%3EChildren%26%2339%3Bs+garden+furniture%26referredURL%3Dhttp%3A%2F%2Fwww.argos.co.uk%2Fwebapp%2Fwcs%2Fstores%2Fservlet%2FProductDisplay%253FstoreId%253D10001%2526catalogId%253D1500001501%2526productId%253D1500357023%2526langId%253D-1"
     aff_url2 = url_query_parameter(aff_url1, 'url')
     self.assertEqual(aff_url2, "http://www.argos.co.uk/webapp/wcs/stores/servlet/ArgosCreateReferral?storeId=10001&langId=-1&referrer=COJUN&params=adref%3DGarden and DIY->Garden furniture->Children&#39;s garden furniture&referredURL=http://www.argos.co.uk/webapp/wcs/stores/servlet/ProductDisplay%3FstoreId%3D10001%26catalogId%3D1500001501%26productId%3D1500357023%26langId%3D-1")
     prod_url = url_query_parameter(aff_url2, 'referredURL')
     # fails, prod_url is None now
     self.assertEqual(prod_url, "http://www.argos.co.uk/webapp/wcs/stores/servlet/ProductDisplay?storeId=10001&catalogId=1500001501&productId=1500357023&langId=-1")
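A minimal sketch of the workaround mentioned in the docstring above (not part of the original test; the helper name is illustrative): percent-encode the HTML entity &#39; before the second extraction, so the '#' inside the entity is no longer treated as a URL fragment and referredURL stays in the query string.

from w3lib.url import url_query_parameter

def extract_referred_url(aff_url2):
    # &#39; contains '#', which urlsplit treats as the start of a fragment;
    # replacing it with %27 keeps the rest of the query string intact.
    cleaned = aff_url2.replace("&#39;", "%27")
    return url_query_parameter(cleaned, "referredURL")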
Example #2
    def parse(self, response):
        self.log("Crawled %s %d" % (response.url, response.status), level=scrapy.log.INFO)
        # self.log("Crawled (%d) <GET %s>"%(response.status,response.url),level=scrapy.log.INFO)
        if response.status // 100 != 2:  # skip non-2xx responses
            return

        res_data = json.loads(response.body)
        if res_data["retcode"] != "0" and res_data["retcode"] != 0:
            self.log("Crawled %s %s" % (response.url, res_data["retcode"]), level=scrapy.log.CRITICAL)
            return
        bid = url_query_parameter(response.url, 'bid')
        start = url_query_parameter(response.url, 'start')
        min_time = time.time()
        for item in res_data["result"]["posts"]:
            url = "http://buluo.qq.com/p/detail.html?bid=%s&pid=%s" % (bid, item["pid"])
            yield self.baidu_rpc_request({"url": url, "src_id": 22})
            if int(item["time"]) < min_time:
                min_time = int(item["time"])

        start = int(start)

        if "total" in res_data["result"] and int(
                res_data["result"]["total"]) > start + 20 and time.time() - min_time < 3600 * 24 * 2:
            next_url = ('http://buluo.qq.com/cgi-bin/bar/post/get_post_by_page?bid=%s&num=20&start=%s&bkn'
                        % (bid, start + 20))
            self.log("SendCrawl %s Total:%d" % (next_url, int(res_data["result"]["total"])), level=scrapy.log.INFO)
            yield scrapy.Request(url=next_url, headers={"Referer": "http://buluo.qq.com/p/barindex.html?bid=%s" % bid})
Example #4
 def test_url_query_parameter(self):
     self.assertEqual(url_query_parameter("product.html?id=200&foo=bar", "id"),
                      '200')
     self.assertEqual(url_query_parameter("product.html?id=200&foo=bar", "notthere", "mydefault"),
                      'mydefault')
     self.assertEqual(url_query_parameter("product.html?id=", "id"),
                      None)
     self.assertEqual(url_query_parameter("product.html?id=", "id", keep_blank_values=1),
                      '')
    def get_date_from_url(self, url):
        year = url_query_parameter(url, "year")
        month = url_query_parameter(url, "mon")
        day = url_query_parameter(url, "day")

        # url_date = datetime(year=int(year), month=int(month), day=int(day), tzinfo=timezone(self.tz))
        url_date = datetime(year=int(year), month=int(month), day=int(day))
        # print(url_date)
        return url_date
    def parse_product_list(self, response):
        hxs = HtmlXPathSelector(response)

        categories = hxs.select('//li[@class="PANEL ALL"]//a/@href').extract()
        categories += hxs.select(
            '//li[@class="PANEL BY-SIZE"]//a/@href').extract()
        categories += hxs.select(
            '//li[@class="PANEL BY-TYPE"]//a/@href').extract()
        for url in categories:
            url = url_query_cleaner(response.urljoin(url))
            yield Request(url, callback=self.parse_product_list)

        products = hxs.select('//div[@id="pdList"]//a/@href').extract()
        products += hxs.select(
            '//div[@class="product-tile"]//a/@href').extract()
        for url in products:
            pid = url.split('_')[-1]
            if pid not in self.parsed_products:
                self.parsed_products.append(pid)
                url = url_query_cleaner(response.urljoin(url))
                yield Request(url, callback=self.parse_product)

        product_variants = hxs.select(
            '//div[@class="productVariantTypeOptions"]/a/@href').extract()
        for url in product_variants:
            self.log('productVariantTypeOptions! {}'.format(url))
            pid = url.split('_')[-1]
            if pid not in self.parsed_products:
                self.parsed_products.append(pid)
                url = url_query_cleaner(response.urljoin(url))
                yield Request(url, callback=self.parse_product)

        next_page = None
        cur_page = url_query_parameter(response.url, 'pi', None)
        if cur_page:
            # The spider is already crawling the pages, we just assign the current url
            # so we can increment the 'pi' argument
            next_page = response.url
        else:
            # First page of the product list, we extract the pagination url with regex
            next_page = re.findall('.get\( &quot;(.*)pi=', response.body)
            if next_page:
                next_page = response.urljoin(next_page[0])

        if (next_page and products != response.meta.get('products', [])) or (
                next_page and
                product_variants != response.meta.get('product_variants', [])):
            cur_page = url_query_parameter(next_page, 'pi', '1')
            url = add_or_replace_parameter(next_page, 'pi',
                                           str(int(cur_page) + 1))
            self.log('Goes to next page: ' + url)
            yield Request(url,
                          callback=self.parse_product_list,
                          meta={
                              'products': products,
                              'product_variants': product_variants
                          })
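The pagination step above (see the comments about the 'pi' argument) boils down to reading the current page number and rewriting it in the URL. A minimal standalone sketch of that pattern, assuming w3lib's url_query_parameter and add_or_replace_parameter (the helper name is illustrative):

from w3lib.url import add_or_replace_parameter, url_query_parameter

def next_page_url(page_url):
    # Read the current 'pi' parameter (defaulting to '1') and bump it by one.
    cur_page = url_query_parameter(page_url, 'pi', '1')
    return add_or_replace_parameter(page_url, 'pi', str(int(cur_page) + 1))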
Example #8
    def get_date_from_url(url):
        day = url_query_parameter(url, u"fecha_dia")
        month = url_query_parameter(url, u"fecha_mes")
        year = url_query_parameter(url, u"fecha_anio")

        page_date = {
            u"day": int(day),
            u"month": int(month),
            u"year": int(year),
        }

        return page_date
 def parse_comments(self, response):
     items = []
     # the first item is not a comment
     for athing_sel in response.xpath('//tr[@class="athing"]')[1:]:
         comment_item = CommentItem()
         comment_item['hacker_news_item'] = url.url_query_parameter(
             response.url, "id")
         comment_item['nesting_level'] = int(
             athing_sel.xpath(
                 ".//td[@class='ind']/img/@width").extract_first())
         comment_item['text'] = "\n".join(
             athing_sel.xpath(
                 ".//td[@class='default']//span[@class='comment']//text()").
             extract()[1:-6])
         comment_item['user_name'] = athing_sel.xpath(
             ".//td[@class='default']//span[@class='comhead']/"
             "a[starts-with(@href, 'user')]/text()").extract_first()
         comment_item['id_'] = athing_sel.xpath(
             ".//td[@class='default']//a[starts-with(@href, 'item')]/@href"
         ).extract_first()
         comment_item['hacker_news_item'] = response.url
         items.append(comment_item)
     self.fill_parents(items)
     for item in items:
         yield item
Example #10
    def getPrice(self, url):
        # check if it's a Headphone Zone link
        if self.isHeadphonezoneLink(url):
            # remove references
            # this is done so that we save only the product url in database
            #url = self.cleanURL(url)
            price = {}
            res = requests.get(url, headers=self.headers)
            #print(res.content)
            soup = BeautifulSoup(res.content, 'html5lib')
            scripts = None
            try:
                scripts = soup.findAll('script', type='application/ld+json')
                for script in scripts:
                    script = json.loads(script.text)
                    if 'offers' in script:
                        scripts = script
                if not isinstance(scripts, dict):
                    scripts = None
            except AttributeError:
                return None
            finally:
                price['currency'], price['regular'], price[
                    'title'] = self.filterPrice(
                        scripts, url_query_parameter(url, 'variant'))

            return price
    def run_crawl_all(self, response):
        print(' --- run_crawl_all --- ')
        t = datetime.datetime.now().strftime("%Y.%m.%d-%H:%M:%S")
        next_offset = int(url_query_parameter(response.url, 'offset')) + 10

        list_parse_res = list_parse(eval(response.body.decode()))
        list_db_data = list_into_dbdata(list_parse_res,
                                        self.task['task_biz_enname'],
                                        self.task['task_biz_chname'],
                                        self.task['_id'])

        # reached the end, or something went wrong
        if not list_db_data:
            self.task['task_status'] = 'end_success'
            print('Reached the end, about to exit')
        else:
            res = mongo_instance.loads.insert_many(list_db_data)
            if self.crawled_times == 1:
                print(' The first inserted id is: %s' % res.inserted_ids[0])
                self.task['task_start_loadid'] = res.inserted_ids[0]
            self.crawled_times += 1
            print('More requests to go, not exiting yet')

        self.task['task_updatetime'] = t
        self.task['task_endtime'] = t
        mongo_instance.tasks.find_one_and_update(
            filter={'_id': self.task['_id']}, update={'$set': self.task})

        if 'running' not in self.task['task_status']:
            return
        else:
            yield scrapy.Request(url=add_or_replace_parameter(
                response.url, 'offset', next_offset),
                                 headers=FakeLoadParams.headers,
                                 method='GET')
Example #12
def getAccessToken(link, user_id, password, birthyear, userprofile):
    code = None
    #try :
    r = requests.get(link, allow_redirects=True)
    #print(r.text.encode('utf-8'))
    # Open the login page in a stateful browser
    browser = mechanicalsoup.StatefulBrowser()
    browser.open(link)

    # Fill in the login form
    form = browser.select_form()

    browser["apiKey"] = userprofile.api_key
    browser["username"] = user_id
    browser["password"] = password
    browser["password2fa"] = birthyear
    resp = browser.submit_selected()
    print(resp)
    if resp.status_code == 200:
        #print '200'
        page = browser.get_current_page()
        #print page
        res = page.find("p", class_="error-msg")
        if res is not None:
            raise RuntimeError(res.getText())
    form1 = browser.select_form()
    browser.get_current_form().choose_submit_by_value('Accept')
    resp = browser.submit_selected()
    print(resp)
    code = url_query_parameter(resp.url, 'code')
    #print(code)
    return code
    def parse(self, response):
        print("procesing:" + response.url)

        # extract links
        featured_urls = response.css(
            'section[class="featured-articles__wrapper___1dxwZ"] a::attr(href)'
        ).extract()

        trending_urls = response.css(
            'section[class="trending-stories__wrapper___1KLqW"] a::attr(href)'
        ).extract()

        promoted_urls = response.css(
            'section[class="promoted-stories__section___zvI1g"] a::attr(href)'
        ).extract()

        topic_urls = response.css(
            'section[class="topics-river__wrapper___2CozB"] a::attr(href)'
        ).extract()

        all_urls = featured_urls + trending_urls + promoted_urls + topic_urls
        all_urls = [i for i in all_urls if '/article/' in i]
        all_urls = list(set(all_urls))

        for url in all_urls:
            yield scrapy.Request(
                url,
                meta={'deltafetch_key': url_query_parameter(url, 'id')},
                callback=self.parse_attr)
Example #14
    def parse_listing(self, response):
        """
        Extract product list.
        
        @url https://www.walgreens.com/store/c/eyes/ID=360457-tier3
        @returns requests 1
        """
        blob = response.css('script').re_first(
            r'__APP_INITIAL_STATE__ = (\{.+\});')
        if not blob:
            return

        data = json.loads(blob)

        if not data['searchResult'].get('productList'):
            return

        for each in data['searchResult']['productList']:
            yield response.follow(each['productInfo']['productURL'],
                                  callback=self.parse_product)

        limit = response.meta.get('limit', 24)
        offset = int(url_query_parameter(response.url, 'No', 0)) + limit

        return response.follow(add_or_replace_parameter(
            response.url, 'No', offset),
                               callback=self.parse_listing,
                               meta={
                                   'offset': offset,
                                   'limit': limit
                               })
Example #15
 def color_ids(self, response, pid):
     colour_urls = response.css(
         self.css.colour_link_css).extract() + [response.url]
     return [
         url_query_parameter(color_url, 'dwvar_{}_color'.format(pid))
         for color_url in colour_urls
     ]
Example #16
 def parse_category(self, response):
     category = response.css('li.last::text').extract()
     products = response.xpath('//div[@typeof="Product"]')
     for product in products:
         loader = ProductLoader(Product(), selector=product)
         loader.add_xpath('identifier', './/*[@property="url"]/@sku')
         url = product.xpath('.//*[@property="url"]/@href').extract_first()
         loader.add_value('url', response.urljoin(url))
         loader.add_xpath('name', './/*[@property="url"]/text()')
         loader.add_xpath('price', './/*[@property="price"]/text()')
         loader.add_xpath('sku', './/*[@property="url"]/@sku')
         loader.add_xpath('category', '//li[@typeof="v:Breadcrumb"]/a/text()')
         loader.add_value('category', category)
         loader.add_xpath('image_url', './/*[@property="image"]/@content')
         if loader.get_output_value('price') < 50:
             loader.add_value('shipping_cost', '9.95')
         if product.xpath('.//button[starts-with(@id, "outOfStock")]'):
             loader.add_value('stock', 0)
         yield loader.load_item()
         
     if url_query_parameter(response.url, 'pn') or re.search('/cat_.+/.', response.url):
         return
     filters = response.css('ul.filters input::attr(id)').re('^\S{5}$')
     for filt in filters:
         url = response.url + '/' + filt
         yield Request(url, self.parse_category)
Example #17
    def parse(self, response):
        print("procesing:" + response.url)

        # extract links
        top_links = response.css('.mainStory::attr(href)').extract()

        moretop_links = response.css(
            'div[class="assetBody riverPost"] > a::attr(href)').extract()

        subtop_links = response.css(
            'a[class="related content_article"]::attr(href)').extract()

        righttop_links = response.css(
            'div[class="assetBody dekRight riverPost"] > a::attr(href)'
        ).extract()

        latest_links = response.css('h3 > .assetHed::attr(href)').extract()

        all_links = top_links + moretop_links + subtop_links + righttop_links + latest_links
        all_links = [i for i in all_links if '/news' in i]
        all_links = list(set(all_links))

        for link in all_links:
            url = self.base_url + link
            yield scrapy.Request(
                url,
                meta={'deltafetch_key': url_query_parameter(url, 'id')},
                callback=self.parse_attr)
Example #18
    def parse(self, response):
        print("procesing:" + response.url)

        # extract links
        col1_links = response.css(
            'div[class="column_one "] > ul > li > div > a::attr(href)'
        ).extract()

        col2_links = response.css(
            'div[class="column_two"] > ul > li > div > a::attr(href)').extract(
            )

        other_links = response.css(
            'div[class="flex-feature | container container_column--desktop "] > a::attr(href)'
        ).extract()

        box_links = response.css(
            'div[class="envelope-container"] a::attr(href)').extract()

        all_links = col1_links + col2_links + other_links + box_links
        all_links = list(set(all_links))
        all_links = [i for i in all_links if '/story' in i]

        for link in all_links:
            url = self.base_url + link
            yield scrapy.Request(
                url,
                meta={'deltafetch_key': url_query_parameter(url, 'id')},
                callback=self.parse_attr)
Example #19
    def parse(self, response):
        print("procesing:" + response.url)

        # extract links
        top_link = response.xpath(
            "//a[@class = 'gs-c-promo-heading gs-o-faux-block-link__overlay-link gel-paragon-bold gs-u-mt+ nw-o-link-split__anchor']/@href"
        ).extract()

        other_links = response.css(
            'a[class="gs-c-promo-heading gs-o-faux-block-link__overlay-link gel-pica-bold nw-o-link-split__anchor"]::attr(href)'
        ).extract()

        list_links = response.css(
            'a[class="qa-heading-link lx-stream-post__header-link"]::attr(href)'
        ).extract()

        # create urls
        all_links = top_link + other_links + list_links
        all_links = [i for i in all_links if '/news/' in i and '/av/' not in i]
        all_links = list(set(all_links))

        for link in all_links:
            url = self.base_url + link
            yield scrapy.Request(
                url,
                meta={'deltafetch_key': url_query_parameter(url, 'id')},
                callback=self.parse_attr)
Example #20
    def parse(self, response):
        print("procesing:" + response.url)

        # extract links
        top_links = response.css(
            'div[class="sc-dnqmqq euQtQS pages__Row-r8po92-0 pages__TopContentRow-r8po92-1 fHFVhw"] a::attr(href)'
        ).extract()
        top_links = [
            i for i in top_links if len(i) > 20 and '/authors' not in i
        ]

        other_links = response.css(
            'div[class="sc-dnqmqq euQtQS pages__Row-r8po92-0 pages__CenterContentRow-r8po92-3 iZXSgS"] a::attr(href)'
        ).extract()
        other_links = [i for i in other_links if '/authors' not in i]

        all_links = top_links + other_links
        all_links = list(set(all_links))

        for link in all_links:
            url = self.base_url + link
            yield scrapy.Request(
                url,
                meta={'deltafetch_key': url_query_parameter(url, 'id')},
                callback=self.parse_attr)
Example #21
    def parse(self, response):
        print("procesing:"+response.url)

        # extract links
        all_links = response.css('.card__link::attr(href)').extract()

        for link in all_links:
            url = self.base_url + link
            yield scrapy.Request(url, meta={'deltafetch_key': url_query_parameter(url, 'id')}, callback=self.parse_attr)
Example #22
    def parse(self, response):
        print("procesing:"+response.url)

        # extract links
        all_urls = response.css('h3 > a::attr(href)').extract()
        all_urls = list(set(all_urls))

        for url in all_urls:
            yield scrapy.Request(url, meta={'deltafetch_key': url_query_parameter(url, 'id')}, callback=self.parse_attr)
    def extract_links(self, response):
        page_no = url_query_parameter(response.url, 'page_no', None)
        if not response.css('.Result a'):
            return []

        return [
            Link(url=add_or_replace_parameter(response.url, 'page_no',
                                              int(page_no) + 1))
        ]
Example #24
    def parse_curr(self, resp):
        # print(resp.text)
        curr_val = resp.xpath(
            '/html/body/div/div/div/h2/strong/text()').extract()
        # print(curr_val)
        if curr_val:
            # print(curr_val)
            curr_val = "".join(curr_val)
            curr_val = "".join(curr_val.split())
            try:
                curr_val = float(curr_val)
            except Exception:
                curr_val = None
        else:
            curr_val = None

        # print(curr_val)
        if curr_val:

            from_cur = url_query_parameter(resp.url, "primary")
            to_cur = url_query_parameter(resp.url, "secondary")

            corr_id = self.conn.get_corr_id(from_cur.upper(), to_cur.upper())
            # print(corr_id)
            if corr_id:
                item = TestSpiderItem()
                item["value"] = curr_val
                item["corr_id"] = corr_id
                yield item

        base_urls = self.conn.get_urls()
        for url in base_urls:
            yield SplashRequest(
                url=url,
                # url="https://ru.cryptonator.com/rates/convert/?amount=1&primary=btc&secondary=ltc&source=liverates",
                # callback=self.get_coocie,
                callback=self.parse_curr,
                endpoint="execute",
                cache_args=["lua_source"],
                args={
                    "lua_source": script,
                },
                meta={'dont_redirect': True},
                dont_filter=True)
Example #25
    def parse(self, response):
        print("procesing:"+response.url)

        # extract links
        all_urls = response.css('div[class="PageBuilder-col-9 PageBuilder-col"] a::attr(href)').extract()
        all_urls = [i for i in all_urls if '/2020/' in i]
        all_urls = list(set(all_urls))

        for url in all_urls:
            yield scrapy.Request(url, meta={'deltafetch_key': url_query_parameter(url, 'id')}, callback=self.parse_attr)
Example #26
 def get_search_results_requests(self, response):
     sel = scrapy.Selector(response)
     for url in sel.css('h3.r a::attr(href)').extract():
         if 'google.com/url' in url:
             url = url_query_parameter(url, 'url')
             referer = urljoin(response.url, url)
         else:
             referer = 'https://www.google.com/url?url=%s' % url
         headers = {'Referer': referer}
         yield scrapy.Request(url, headers=headers)
def get_page(response):
    from_page = response.meta.get('from_page', None)

    if from_page:
        page = from_page + 1
    else:
        page = url_query_parameter(response.url, 'p', None)
        if page:
            page = str_to_int(page)

    return page
Example #28
 def parse(self, response):
     div = response.xpath(
         "/html/body/div[9]/div[2]/section/div[1]/ul//div[@class='small-card search-Result-Card col-lg-6 col-md-6 col-sm-12 col-xs-12']"
     )
     urls = div.xpath("//div/div/div[1]/div[2]/a[@href]").get().split(
         "/")[3].split('-')[0],
     for code in urls:
         yield scrapy.Request(
             code,
             meta={'deltafetch_key': url_query_parameter(code, 'code')},
             callback=self.parse_element)
    def parse(self, response):
        print("procesing:"+response.url)

        # extract links
        all_links = response.css('.sdc-site-tile__headline > a::attr(href)').extract()
        all_links = [i for i in all_links if '/video' not in i]
        all_links = list(set(all_links))

        for link in all_links:
            url = self.base_url + link
            yield scrapy.Request(url, meta={'deltafetch_key': url_query_parameter(url, 'id')}, callback=self.parse_attr)
Example #30
    def parse(self, response):
        print("procesing:"+response.url)

        # extract links
        all_links = response.css('h2[class="title"] > a::attr(href), h4[class="title"] > a::attr(href)').extract()
        all_links = [i for i in all_links if '/science' in i and 'pictures' not in i]
        all_links = list(set(all_links))

        for link in all_links:
            url = self.base_url + link
            yield scrapy.Request(url, meta={'deltafetch_key': url_query_parameter(url, 'id')}, callback=self.parse_attr)
Example #31
    def get_pollutant_name(url):
        code_name = url_query_parameter(url, u"contaminante")
        name = None
        if code_name == u"1":
            name = u"co"
        elif code_name == u"2":
            name = u"no2"
        elif code_name == u"3":
            name = u"pm10"

        return name
Example #32
 def parse_options(self, response):
     data = json.loads(response.body)
     identifier = url_query_parameter(response.url, 'productId')
     sku = url_query_parameter(response.url, 'skuId')
     loader = ProductLoader(Product(), response=response)
     loader.add_value(None, response.meta['item'])
     loader.replace_value('identifier', '.'.join((identifier, sku)))
     loader.replace_value('sku', sku)
     loader.replace_value('name', data['skuName'])
     if not data['skuName'].endswith(
             data['size']) and not data['skuName'].endswith(
                 data['size'].replace(' ', '')):
         loader.add_value('name', data['size'])
     loader.replace_value('image_url',
                          response.urljoin(data['thumbnail_url']))
     loader.replace_value('price', str(data['unit_sale_price']))
     loader.replace_value('stock', data['stock'])
     if Decimal(data['unit_sale_price']) < 20:
         loader.add_value('shipping_cost', '2.99')
     yield loader.load_item()
Example #34
 def parse(self, response):
     data = json.loads(response.text)
     usernames = [user['components']['username']['val'] for user in data['completions']]
     for username in usernames:
         yield scrapy.Request(
             url='https://keybase.io/{}'.format(username),
             callback=self.parse_profile,
         )
     if len(usernames) == 100:
         # A full page of 100 completions suggests the results are truncated,
         # so fan out by extending the query prefix with each letter.
         q = url_query_parameter(response.url, 'q')
         for c in ascii_lowercase:
             yield scrapy.Request(url=SEARCH_URL.format(q + c))
Example #35
    def parse_hotel(self, response):
        hxs = Selector(response)
        hotel = HtmlParser.extract_hotel(response.url, hxs)

        checkin = url_query_parameter(response.url,"checkin")
        checkout = url_query_parameter(response.url,"checkout")

        checkinDatetime = None
        checkoutDatetime = None

        today = datetime.date.today()

        if checkin is not None:
            checkinDatetime = datetime.datetime.strptime(checkin, "%Y-%m-%d").date()
            checkinDatetime = self.add_months(checkinDatetime,1)
        else:
            checkinDatetime = datetime.date(today.year, today.month, 15)

        if checkout is not None:
            checkoutDatetime = datetime.datetime.strptime(checkout, "%Y-%m-%d").date()
            checkoutDatetime = self.add_months(checkoutDatetime,1)
        else:
            checkoutDatetime = datetime.date(today.year, today.month, 16)

        maxDatetime = self.add_months(today,18)

        if checkinDatetime < maxDatetime:
            url = url_query_cleaner(response.url)
            url = add_or_replace_parameter(url,"checkin",str(checkinDatetime))
            url = add_or_replace_parameter(url,"checkout",str(checkoutDatetime))
            #logging.warning('----------------------------  %s' % url)
            yield Request(url, callback=self.parse_hotel)

        yield hotel["hotel"]

        if len(hotel["rooms"]) > 0:
            for room in hotel["rooms"]:
                yield room
Example #36
 def parse_term(self, response):
   '''
     @url https://api.oyez.org/cases?filter=term:2014&page=0
     @returns requests 31 31
     @returns items 0 0
   '''
   cases = json.loads(response.body)
   results = []
   for case in cases:
     url = case['href']
     results.append(scrapy.Request(url=url, callback=self.parse_case))
   if len(cases)>=30:
     page = int(url_query_parameter(response.url, 'page')) + 1
     results.append(scrapy.Request(url=self.term_url(page), callback=self.parse_term))
   return results
Example #37
File: bvg.py  Project: sbry/scrapy-berlin
 def parse(self, content):
     for href in content.css('.moment-table__more > a:nth-child(1)::attr("href")'):
         parts = {}
         source_url = content.urljoin(href.extract())
         parts['source_url'] = source_url
         parts['source_id'] = url_query_parameter(source_url, 'id')
         parts['source_name'] = self.name
         # <div class="">24.01.2016 um 17:00 Uhr • Peterpower</div>
         pub_date = content.css('.moment-table__datetime::text').extract_first().partition(' Uhr')[0]
         parts['time'] = datetime.strptime(pub_date , "%d.%m.%Y um %H:%M")
         # here
         request = scrapy.Request(source_url, callback = self.parse_item_page)
         request.meta['parts'] = parts
         yield request
     ##
     # and open the next page
     if os.getenv('SCRAPY_BVG_RECURSIVE', False):
         next_page = content.css('a.paging__control--next::attr("href")').extract_first()
         logging.log(logging.INFO, "Recursing into next page %s" % next_page)
         yield scrapy.Request(next_page, callback = self.parse)
     pass
Example #38
	def movie_page(self, response):

		hxs = Selector(response)
		item = ThreedmmcomItem()
		item['thread_url'] = response.url
		item['thread_id'] = url_query_parameter(response.url, 't')
		item['name'] = firstOrNone(hxs.select('//div[@class="bigusername"]/text()'))
		files = item['files'] = hxs.select('//a[@data-location]/@data-location').extract()
		if not files:
			return

		meta=item['meta']={}

		metadivs=response.xpath('//table[starts-with(@id,"post")]//td[@class="alt1" and @width="125"]/div')
		for i,entry in enumerate(metadivs):
			name=firstOrNone(entry.css('div.smallfont').xpath('text()'))
			if name and i<len(metadivs)-2:
				meta[name]=firstOrNone(metadivs[i+1].xpath('./descendant-or-self::*[name() != "script" and name() != "style"]/text()[normalize-space()]'))
		version=response.xpath('//table[starts-with(@id,"post")]//td[@class="alt1" and @width="125"]/table/tr[1]//strong/text()').extract()
		if version:
			item['meta']['version']=version[0].strip()

		return item