Example #1
    def find_last_page(self) -> int:
        current_last_page = self.default_last_page_idx
        found_last_page = 0
        while found_last_page < self.default_last_page_idx:
            response1 = scrapy.Request(get_page_url(current_last_page))
            response2 = scrapy.Request(get_page_url(current_last_page + 1))
            if response1 == response2:
                return current_last_page
            else:
                current_last_page += 1

            if current_last_page > self.max_last_page:
                raise LookupError("Unable to find last page")
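
Note that constructing a scrapy.Request, as above, does not download anything, and comparing two Request objects compares the objects rather than the pages they would fetch. A minimal callback-driven sketch of the same last-page search, assuming the get_page_url() helper and max_last_page attribute from the snippet plus a hypothetical div.item selector for "this page still has listings":

    def start_requests(self):
        # start probing from page 1
        yield scrapy.Request(get_page_url(1), callback=self.check_page,
                             cb_kwargs={"page": 1})

    def check_page(self, response, page):
        # stop when the page has no listings (hypothetical selector) or the cap is hit
        if not response.css("div.item") or page > self.max_last_page:
            self.logger.info("last page is %d", page - 1)
            return
        yield scrapy.Request(get_page_url(page + 1), callback=self.check_page,
                             cb_kwargs={"page": page + 1})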
Example #2
 def parse_3rd_level(self, response):
     hrefs_3 = response.xpath('//div[@class="categoryRefinementsSection"]/ul/li[a[span[@class="refinementLink"]]]/a/@href').extract()
     hrefs_3 = map(str.strip, hrefs_3)
     for href3 in hrefs_3:
         print(href3)
         url = base_url + str(href3)
         yield scrapy.Request(url, callback=self.parse_4th_level)
    def start_requests(self):
        urls = ['insert BZA_Calendar URL here']
        return [scrapy.Request(url=url, callback=self.parse) for url in urls]

    def parse(self, response):
        url = response.url
        title = response.css('h1::text').extract_first()
        print('URL is: {}'.format(url))
        print('Title is: {}'.format(title))
Example #4
 def parse_all(self, response):
     for content in response.xpath(
             "//ul[@class='city_list_ul']//li[contains(@data-action,'国内')]"
     ):
         city = content.xpath("./text()").extract_first()
         urls = "https://" + content.xpath(".//a/@href").extract_first() + "chengjiao/"
         print(city)
         yield scrapy.Request(url=urls,
                              callback=self.parse_city,
                              meta={"city": city},
                              dont_filter=False)
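
The meta={"city": city} above is read back in the callback through response.meta. A minimal sketch of the hypothetical parse_city callback the snippet assumes (the listing selectors are illustrative, not taken from the original project):

 def parse_city(self, response):
     # read the city name forwarded through the request meta
     city = response.meta["city"]
     # hypothetical listing selector; the real page structure is not shown in the snippet
     for row in response.xpath("//ul[@class='listContent']/li"):
         yield {
             "city": city,
             "title": row.xpath(".//div[@class='title']/a/text()").extract_first(),
         }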
Example #5
 def parse(self, response):
     links = response.xpath('//div[@class="hp-card-list"]/section[@class="hp-card"]/a[@class="hp-card__content"]/@href').extract()
     i = 1
     for link in links:
         abs_url = response.urljoin(link)
         url_next = '//div[@class="hp-card-list"]/section[@class="hp-card"]/a/div[@class="hp-card__estimates"]/div/span[@class="hp-datum__value"]/text()'
         estimate = response.xpath(url_next).extract()
         if (i <= len(links)):
             i = i + 1
             time.sleep(10)
             yield scrapy.Request(abs_url, callback=self.parse_indetail, meta={'estimate': estimate})
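
One caveat about the time.sleep(10) above: it blocks Scrapy's reactor, stalling every request in flight. A common alternative is to let Scrapy throttle requests itself, for example with DOWNLOAD_DELAY; a minimal sketch, not part of the original snippet:

import scrapy

class SlowSpider(scrapy.Spider):
    name = "slow_sketch"
    # Scrapy waits ~10 seconds between requests without blocking the reactor
    custom_settings = {"DOWNLOAD_DELAY": 10}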
Example #6
    def parse(self, response):
        data = json.loads(response.body)
        html_str = data['list']
        soup = BeautifulSoup(html_str, 'lxml')
        for link in soup.find_all('h4',
                                  {'class': 'centerLaneCardHeader agent'}):
            # the <h4> tag itself is not a URL; assume it wraps an <a> holding the profile link
            href = link.find('a')['href']
            yield scrapy.Request(url=response.urljoin(href),
                                 callback=self.prof_parse)

        pagenumberregex = re.compile(r"(?<=&s=)\d+")
        # re.sub takes (replacement, string): bump the &s= result offset by 10 for the next page
        next_offset = str(int(pagenumberregex.findall(response.url)[0]) + 10)
        yield scrapy.Request(url=pagenumberregex.sub(next_offset, response.url),
                             callback=self.parse)
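
For reference, the offset substitution above bumps the &s= page offset by 10; on a hypothetical result URL of the same shape it behaves like this:

import re

pagenumberregex = re.compile(r"(?<=&s=)\d+")
url = "https://example.com/agents/?q=new-york&s=10"  # hypothetical URL shape
next_offset = str(int(pagenumberregex.findall(url)[0]) + 10)
print(pagenumberregex.sub(next_offset, url))
# prints: https://example.com/agents/?q=new-york&s=20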
 def start_requests(self):
     for url in urls:
         yield scrapy.Request(url=url)
Example #8
File: demo2.py  Project: Ricsk/for_scrapy
 def start_requests(self):
     urls = ['http://python23.io/ws/demo.html']
     for url in urls:
         yield scrapy.Request(url=url, callback=self.parser)
 def start_requests(self):
     for url in self.start_urls:
         yield scrapy.Request(url=url, cookies=self.cookies, callback=self.parse)
Example #10
 def start_requests(self):
     urls = [
         'https://www.indeed.com/jobs?as_and=Data+Scientist&as_phr=&as_any=&as_not=&as_ttl=&as_cmp=&jt=all&st=&sr=directhire&as_src=&salary=&radius=25&l=New+York%2C+New+York&fromage=last&limit=50&sort=&psf=advsrch'
     ]
     for url in urls:
         yield scrapy.Request(url=url, callback=self.parse)
Example #11
 def start_requests(self):
     start_urls = 'https://www.goodreads.com/quotes?page=1'
     yield scrapy.Request(url=start_urls, callback=self.parse)
    def parse(self, response):
        soup = BeautifulSoup(response.text, "html.parser")
        main_list = soup.find("div", {"class": "bbs-screen"})  # grab the main article-list container

        # Walk the tags of the article list in order; stop at the separator and ignore everything after it
        for div in main_list.findChildren("div", recursive=False):
            class_name = div.attrs["class"]

            # Handle the separator line
            if class_name and "r-list-sep" in class_name:
                self.log("reach the last article")
                break
            # A target article entry
            if class_name and "r-ent" in class_name:
                div_title = div.find("div", {"class": "title"})
                a_title = div_title.find("a", href=True)
                # Skip the article if it has been deleted
                if not a_title or not a_title.has_attr("href"):
                    continue
                article_URL = urljoin(self.host, a_title["href"])
                article_title = a_title.text
                self.log("Parse article {}".format(article_title))
                yield scrapy.Request(url=article_URL,
                                     callback=self.parse_article,
                                     cookies=self.cookies)

    def parse_article(self, response):
        # If the response is not 200 OK, treat the request as failed
        if response.status != 200:
            print("error-{} is not available to access".format(
                response.url))
            return

        soup = BeautifulSoup(response.text, "html.parser")
        main_content = soup.find(id="main-content")  # grab the article body
        # If the article has meta blocks, pull author, title and date out of them
        metas = main_content.select("div.article-metaline")
        author = ""
        title = ""
        date = ""
        if metas:
            if metas[0].select("span.article-meta-value")[0]:
                author = metas[0].select(
                    "span.article-meta-value")[0].string
            if metas[1].select('span.article-meta-value')[0]:
                title = metas[1].select(
                    'span.article-meta-value')[0].string
            if metas[2].select('span.article-meta-value')[0]:
                date = metas[2].select('span.article-meta-value')[0].string

            # Strip the meta blocks out of main_content
            for m in metas:
                m.extract()
            for m in main_content.select("div.article-metaline-right"):
                m.extract()

        # Grab the push (comment) section
        pushes = main_content.find_all("div", {"class": "push"})
        for p in pushes:
            p.extract()

        # If the article contains "※ 發信站: 批踢踢實業坊(ptt.cc), 來自: xxx.xxx.xxx.xxx",
        # pull the IP out with a regular expression
        try:
            ip = main_content.find(text=re.compile(u"※ 發信站:"))
            ip = re.search("[0-9]*\.[0-9]*\.[0-9]*\.[0-9]*", ip).group()
        except Exception as e:
            ip = ""

        # Strip redundant whitespace and blank lines from the article body
        filtered = []
        for v in main_content.stripped_strings:
            # Keep every string that does not start with a special symbol or "--"
            if v[0] not in [u"※", u"◆"] and v[:2] not in [u"--"]:
                filtered.append(v)
        # Define a filter for special and full-width symbols
        expr = re.compile(u"[^一-龥。;,:“”()、?《》\s\w:/-_.?~%()]")
        for i in range(len(filtered)):
            filtered[i] = re.sub(expr, "", filtered[i])
        # Drop empty strings; the filtered text is the article body
        filtered = [i for i in filtered if i]
        content = "".join(filtered)

        # Process the comment board: p = number of pushes, b = number of boos, n = number of neutral arrows
        p, b, n = 0, 0, 0
        messages = []
        for push in pushes:
            if not push.find("span", "push-tag"):
                continue
            push_tag = push.find("span",
                                 "push-tag").string.strip("\t\n\r")  # push or boo
            push_userid = push.find("span", "push-userid").string.strip(
                "\t\n\r")  # who commented
            push_content = push.find("span", "push-content").strings  # comment text
            push_content = "".join(push_content)[1:].strip("\t\n\r")
            push_ipdatetime = push.find("span",
                                        "push-ipdatetime").string.strip(
                                            "\t\n\r")  # comment date and time

            # Pack up the comment info and tally the push/boo counts
            messages.append({
                "push_tag": push_tag,
                "push_userid": push_userid,
                "push_content": push_content,
                "push_ipdatetime": push_ipdatetime
            })
            if push_tag == u"推":
                p += 1
            elif push_tag == u"噓":
                b += 1
            else:
                n += 1

        # Comment statistics: "count" is pushes minus boos, "all" is the total number of comments
        message_count = {
            "all": p + b + n,
            "count": p - b,
            "push": p,
            "boo": b,
            "neutral": n
        }

        # Assemble the article item
        data = PTTArticleItem()
        article_id = str(Path(urlparse(response.url).path).stem)
        data['url'] = response.url
        data['article_id'] = article_id
        data['article_author'] = author
        data['article_title'] = title
        data['article_date'] = date
        data['article_content'] = content
        data['ip'] = ip
        data['message_count'] = message_count
        data['messages'] = messages
        yield data
 def start_requests(self):
     yield scrapy.Request(url=self.start_urls,
                          callback=self.parse,
                          cookies=self.cookies)
Example #14
 def start_requests(self):
     yield scrapy.Request(url=self.start_urls, callback=self.parse)
Example #15
 def start_requests(self):
     # scrapy.Request takes no `params` keyword; encode query parameters into the URL (see the sketch below)
     yield scrapy.Request(url, headers=headers, callback=self.parse)
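
Unlike requests.get(), scrapy.Request accepts no params argument, so query parameters have to be encoded into the URL itself. A minimal self-contained sketch with hypothetical url, headers and params values (the original snippet does not show where they are defined):

from urllib.parse import urlencode

import scrapy

class SearchSpider(scrapy.Spider):
    name = "search_sketch"

    def start_requests(self):
        # hypothetical example values
        url = "https://example.com/search"
        headers = {"User-Agent": "Mozilla/5.0"}
        params = {"q": "python", "page": 1}
        yield scrapy.Request("{}?{}".format(url, urlencode(params)),
                             headers=headers, callback=self.parse)

    def parse(self, response):
        self.logger.info("fetched %s", response.url)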
 def start_requests(self):
     urls = [
         'https://www.crunchbase.com/app/search/companies',
     ]
     yield scrapy.Request(urls[0])