def parse(self, response):
    """Crawl one index page of news.detik.com.

    Yields a request (handled by ``parse_artikel``) for every article link
    found on the page, then either follows the next index page (when the
    page looks full) or, at the end of the crawl, emits a quality
    notification when too many items were dropped.
    """
    konten_selektor = 'article.list-content__item'
    # Number of items seen on this page — used to decide whether a next
    # index page is likely to exist.
    jumlah_berita = 0
    for konten in response.css(konten_selektor):
        link_selector = 'h3.media__title a ::attr(href)'
        url = konten.css(link_selector).extract_first()
        self.total_scraped += 1
        # Count every item, article or not, so the pagination heuristic
        # below reflects the real page size.
        jumlah_berita += 1
        # Skip non-article links (videos, photo galleries, ...).
        if not isBerita(url):
            continue
        # ?single=1 requests the whole article on one page.
        yield scrapy.Request(url + '?single=1', callback=self.parse_artikel)
    if jumlah_berita > 19:
        # Page looked full (detik index pages hold 20 items) — assume a
        # next page exists and follow it.
        self.hal = self.hal + 1
        next_page = 'https://news.detik.com/indeks/' + str(self.hal) + '?date=' + self.tanggal
        # No explicit callback: scrapy routes it back to this parse method.
        yield scrapy.Request(url=next_page)
    else:
        # End of the index: best-effort quality check — notify when fewer
        # than 2 articles were scraped per dropped item.
        try:
            rasio = self.total_scraped // self.dropped_count
            if rasio < 2:
                kirim_notif(self.name)
        except (ZeroDivisionError, AttributeError):
            # dropped_count may be 0 or a counter may be missing; the
            # notification is best-effort, so stay silent — but no longer
            # swallow unrelated exceptions with a bare except.
            pass
def parse(self, response):
    """Crawl one index page of indeks.kompas.com.

    Yields a request (handled by ``parse_artikel``) for each article on
    the page, follows the next index page while pages look full, and at
    the end of the crawl emits a quality notification when too many items
    were dropped.
    """
    konten_selektor = 'div.article__list.clearfix'
    # Count the articles on this page to drive the pagination heuristic.
    jumlah_berita = 0
    for konten in response.css(konten_selektor):
        # Crawl each article URL.
        link_selector = 'a.article__link ::attr(href)'
        link = konten.css(link_selector).extract_first()
        self.total_scraped += 1
        jumlah_berita = jumlah_berita + 1
        # Guard: extract_first() returns None when the selector matches
        # nothing; the original code crashed on the string concatenation.
        if link is None:
            continue
        # ?page=all requests the whole article on one page.
        yield scrapy.Request(link + "?page=all", callback=self.parse_artikel)
    print("jumlah berita =", jumlah_berita, "----halaman =", self.hal)
    # Follow the next index page while pages look full (15 items per page).
    if jumlah_berita > 14:
        self.hal = self.hal + 1
        next_page = 'https://indeks.kompas.com/?site=news&date=' + self.tanggal + '&page=' + str(
            self.hal)
        yield scrapy.Request(next_page, callback=self.parse)
    else:
        # End of the index: best-effort quality check — notify when fewer
        # than 2 articles were scraped per dropped item.
        try:
            rasio = self.total_scraped // self.dropped_count
            if rasio < 2:
                kirim_notif(self.name)
        except (ZeroDivisionError, AttributeError):
            # dropped_count may be 0 or a counter may be missing; keep the
            # check best-effort without a bare except.
            pass
        print("scraping ---- Selesai Total halaman = ", self.hal)
        print("jumlah berita =", jumlah_berita, "----halaman =", self.hal)
def parse(self, response):
    """Crawl one index page of bisnis.com.

    Yields a request (handled by ``parse_artikel``) for each article on
    the page, follows the next index page while pages look full, and at
    the end of the crawl emits a quality notification when too many items
    were dropped.
    """
    konten_selektor = '.row.mb-30'
    # Count the articles on this page to drive the pagination heuristic.
    jumlah_berita = 0
    for konten in response.css(konten_selektor):
        # Crawl each article URL.
        link_selector = 'a ::attr(href)'
        link = konten.css(link_selector).extract_first()
        jumlah_berita = jumlah_berita + 1
        self.total_scraped += 1
        # Guard: extract_first() returns None when no <a> matches;
        # scrapy.Request(None) would raise and abort the whole page.
        if link is None:
            continue
        yield scrapy.Request(link, callback=self.parse_artikel)
    # Follow the next index page while pages look full (15 items per page).
    if jumlah_berita > 14:
        self.hal = self.hal + 1
        # NOTE(review): the date is sent twice as 'd' and the page number
        # is sent as 'per_page' — looks odd but is kept as-is; confirm
        # against the site's index URL scheme before changing.
        next_page = ('https://www.bisnis.com/index/page/?c=0&d=' + self.tanggal + '&d=' +
                     self.tanggal + '&per_page=' + str(self.hal))
        yield scrapy.Request(next_page, callback=self.parse)
    else:
        # End of the index: best-effort quality check — notify when fewer
        # than 2 articles were scraped per dropped item.
        try:
            rasio = self.total_scraped // self.dropped_count
            if rasio < 2:
                kirim_notif(self.name)
        except (ZeroDivisionError, AttributeError):
            # dropped_count may be 0 or a counter may be missing; keep the
            # check best-effort without a bare except.
            pass
        print("scraping ---- Selesai Total halaman = ", self.hal)
        print("jumlah berita =", jumlah_berita, "----halaman =", self.hal)
def parse(self, response):
    """Crawl one index page of Republika.

    Yields a request (handled by ``parse_artikel``) for every article
    link on the page, follows the pagination link while pages look full,
    and at the end of the crawl emits a quality notification when too
    many items were dropped, then stops the process.
    """
    berita_selector = "div.txt_subkanal.txt_index h2 a::attr(href)"
    i = self.i
    # Number of links seen on this page — used to decide whether the
    # pagination link should be followed.
    jumlah_berita = 0
    for baris in response.css(berita_selector):
        # Crawl each URL on this page. .get() is equivalent to the
        # original .getall()[0] for a single selector.
        url = baris.get()
        # NOTE(review): this spider spells the counter 'total_scrapped'
        # (double p) while the sibling spiders use 'total_scraped' —
        # worth unifying at the class level; kept here to match __init__.
        self.total_scrapped += 1
        jumlah_berita = jumlah_berita + 1
        # Skip non-article links (videos, photo galleries, ...).
        if not isBerita(url):
            continue
        yield scrapy.Request(url, callback=self.parse_artikel)
    # Go to the next page while pages look full (40 items per page).
    if jumlah_berita >= 39:
        np_sel = 'div.pagination section nav a::attr(href)'
        next_page = response.css(np_sel).getall()[-1]
        self.i = i + 1
        yield scrapy.Request(next_page, callback=self.parse)
    else:
        # End of the index: best-effort quality check — notify when fewer
        # than 2 articles were scraped per dropped item.
        try:
            # Bug fix: this read 'self.total_scraped', which this spider
            # never sets — the AttributeError was silently swallowed by a
            # bare except, so the notification could never fire. Read the
            # counter the loop above actually maintains.
            rasio = self.total_scrapped // self.dropped_count
            if rasio < 2:
                kirim_notif(self.name)
        except (ZeroDivisionError, AttributeError):
            # dropped_count may be 0 or a counter may be missing; keep the
            # check best-effort without a bare except.
            pass
        sys.exit("scraping Republika - selesai")
def parse(self, response):
    """Crawl one index page of antaranews.com.

    Yields a request (handled by ``parse_artikel``) for each new article
    on the page, stops the whole process as soon as an already-seen URL
    reappears, follows the next index page while pages look full, and at
    the end of the crawl emits a quality notification when too many items
    were dropped.
    """
    konten_selektor = 'article.simple-post'
    # Count the items on this page to drive the pagination heuristic.
    jumlah_berita = 0
    for konten in response.css(konten_selektor):
        # Crawl each article URL.
        link_selector = 'a ::attr(href)'
        link = konten.css(link_selector).extract_first()
        self.total_scraped += 1
        # A repeated URL means we have wrapped around to content already
        # crawled — terminate the whole run.
        # NOTE(review): url_seen is a list, so this membership test is
        # O(n) per item; a set would be faster, but the attribute type is
        # shared with the rest of the class, so it is kept as-is.
        if link in self.url_seen:
            sys.exit()
        # Count every item, article or not, so the pagination heuristic
        # reflects the real page size.
        jumlah_berita = jumlah_berita + 1
        # Skip non-article links (videos, photo galleries, ...).
        if not isBerita(link):
            continue
        self.url_seen.append(link)
        yield scrapy.Request(link, callback=self.parse_artikel)
    # Follow the next index page while pages look full (10 items per page).
    if jumlah_berita > 9:
        self.hal = self.hal + 1
        next_page = ('https://www.antaranews.com/indeks/' + self.tanggal + '/' + str(self.hal))
        yield scrapy.Request(next_page, callback=self.parse)
    else:
        # End of the index: best-effort quality check — notify when fewer
        # than 2 articles were scraped per dropped item.
        try:
            rasio = self.total_scraped // self.dropped_count
            if rasio < 2:
                kirim_notif(self.name)
        except (ZeroDivisionError, AttributeError):
            # dropped_count may be 0 or a counter may be missing; keep the
            # check best-effort without a bare except.
            pass
        print("scraping ---- Selesai Total halaman = ", self.hal)
        print("jumlah berita =", jumlah_berita, "----halaman =", self.hal)