# Each parse callback below lives in its own link-discovery spider; all of
# them need re and scrapy, plus the project's Database helper and the
# article spiders imported from their own modules.
import re

import scrapy


def parse(self, response):
    # Collect ETtoday Travel article links; skip URLs already recorded
    # in the database, and hand the rest to the article spider.
    hrefs = response.xpath('//a/@href').extract()
    ettoday_travel = EttodayTravelSpider()
    r = re.compile(r"^http://travel\.ettoday\.net/article/\d+\.htm$")
    for href in hrefs:
        if r.match(href) and not Database.find_dup(href):
            yield scrapy.Request(href, callback=ettoday_travel.parse)
def parse(self, response):
    # Collect letsgojp.com archive links.
    hrefs = response.xpath('//a/@href').extract()
    letsgojp = LetsgojpSpider()
    r = re.compile(r"^http://.*letsgojp\.com/archives/\d+/$")
    for href in hrefs:
        if r.match(href) and not Database.find_dup(href):
            yield scrapy.Request(href, callback=letsgojp.parse)
def parse(self, response):
    # Collect beauties.life post links.
    hrefs = response.xpath('//a/@href').extract()
    beauties = BeautiesSpider()
    r = re.compile(r"^http://beauties\.life/\?p=\d+$")
    for href in hrefs:
        if r.match(href) and not Database.find_dup(href):
            yield scrapy.Request(href, callback=beauties.parse)
def parse(self, response):
    # Collect ETtoday news article links.
    hrefs = response.xpath('//a/@href').extract()
    ettoday = EttodaySpider()
    r = re.compile(r"^http://www\.ettoday\.net/news/\d+/\d+\.htm$")
    for href in hrefs:
        if r.match(href) and not Database.find_dup(href):
            yield scrapy.Request(href, callback=ettoday.parse)
def parse(self, response):
    # Collect QQ News article links.
    hrefs = response.xpath('//a/@href').extract()
    qq = QqSpider()
    r = re.compile(r"^http://news\.qq\.com/a/\d+/\d+\.htm$")
    for href in hrefs:
        if r.match(href) and not Database.find_dup(href):
            yield scrapy.Request(href, callback=qq.parse)
def parse(self, response):
    # KKday blog post titles link directly to articles, so no URL pattern
    # is needed here.
    hrefs = response.xpath(
        '//div[contains(@class, "post")]/h3[contains(@class, "post-title")]/a/@href'
    ).extract()
    kkday = KKdaySpider()
    for href in hrefs:
        if not Database.find_dup(href):
            yield scrapy.Request(href, callback=kkday.parse)
def parse(self, response):
    # Storm Media links are relative; skip empty hrefs and prepend the
    # site root before the duplicate check.
    hrefs = response.xpath('//div[@class="main_content"]/a/@href').extract()
    storm = StormSpider()
    for _href in hrefs:
        if _href:
            href = "http://www.storm.mg" + _href
            if not Database.find_dup(href):
                yield scrapy.Request(href, callback=storm.parse)
def parse(self, response):
    # China Times realtime/newspaper links are relative; prepend the site root.
    hrefs = response.xpath('//a/@href').extract()
    chinatimes = ChinatimesSpider()
    r = re.compile(r"^/(realtimenews|newspapers)/\d+-\d+$")
    for _href in hrefs:
        if r.match(_href):
            href = "http://www.chinatimes.com" + _href
            if not Database.find_dup(href):
                yield scrapy.Request(href, callback=chinatimes.parse)
def parse(self, response):
    # JUKSY archive links are relative.
    hrefs = response.xpath('//a/@href').extract()
    juksy = JuksySpider()
    r = re.compile(r"^/archives/\d+$")
    for _href in hrefs:
        if r.match(_href):
            href = "https://www.juksy.com" + _href
            if not Database.find_dup(href):
                yield scrapy.Request(href, callback=juksy.parse)
def parse(self, response):
    # SETN news links are relative.
    hrefs = response.xpath('//a/@href').extract()
    setn = SetnSpider()
    r = re.compile(r"^/News\.aspx\?NewsID=\d+$")
    for _href in hrefs:
        if r.match(_href):
            href = "http://www.setn.com" + _href
            if not Database.find_dup(href):
                yield scrapy.Request(href, callback=setn.parse)
def parse(self, response):
    # Bomb01 article links are relative.
    hrefs = response.xpath('//a/@href').extract()
    bomb01 = Bomb01Spider()
    r = re.compile(r"^/article/\d+$")
    for _href in hrefs:
        if r.match(_href):
            href = "https://www.bomb01.com" + _href
            if not Database.find_dup(href):
                yield scrapy.Request(href, callback=bomb01.parse)
def parse(self, response):
    # ETtoday Sports links are relative.
    hrefs = response.xpath('//a/@href').extract()
    ettoday_sports = EttodaySportsSpider()
    r = re.compile(r"^/news/\d+$")
    for _href in hrefs:
        if r.match(_href):
            href = "http://sports.ettoday.net" + _href
            if not Database.find_dup(href):
                yield scrapy.Request(href, callback=ettoday_sports.parse)
def parse(self, response):
    # cdnews.com.tw links are relative to the cdnews_site directory and
    # carry no leading slash.
    hrefs = response.xpath('//a/@href').extract()
    cdnews = CdnewsSpider()
    r = re.compile(r"^docDetail\.jsp\?coluid=\d+&docid=\d+$")
    for _href in hrefs:
        if r.match(_href):
            href = "http://www.cdnews.com.tw/cdnews_site/" + _href
            if not Database.find_dup(href):
                yield scrapy.Request(href, callback=cdnews.parse)
def parse(self, response):
    # Apple Daily realtime-list items link to articles; links are relative,
    # so the XPath alone selects the targets and no URL pattern is needed.
    hrefs = response.xpath(
        '//div[contains(@class, "abdominis")]/ul/li[contains(@class, "rtddt")]/a/@href'
    ).extract()
    appledaily = AppledailySpider()
    for _href in hrefs:
        href = "http://www.appledaily.com.tw" + _href
        if not Database.find_dup(href):
            yield scrapy.Request(href, callback=appledaily.parse)
def parse(self, response):
    # udn Style story links are relative.
    hrefs = response.xpath('//a/@href').extract()
    udn_style = UdnStyleSpider()
    r = re.compile(r"^/style/story/\d+/\d+$")
    for _href in hrefs:
        if r.match(_href):
            href = "http://style.udn.com" + _href
            if not Database.find_dup(href):
                yield scrapy.Request(href, callback=udn_style.parse)
def parse(self, response):
    # Mobile01 news links are relative without a leading slash, so the
    # base URL keeps its trailing slash.
    hrefs = response.xpath('//a/@href').extract()
    mobile01 = Mobile01Spider()
    r = re.compile(r"^newsdetail/\d+/.*$")
    for _href in hrefs:
        if r.match(_href):
            href = "http://www.mobile01.com/" + _href
            if not Database.find_dup(href):
                yield scrapy.Request(href, callback=mobile01.parse)
def parse(self, response):
    # udn Health story links are relative.
    hrefs = response.xpath('//a/@href').extract()
    udn_health = UdnHealthSpider()
    r = re.compile(r"^/health/story/\d+/\d+$")
    for _href in hrefs:
        if r.match(_href):
            href = "http://health.udn.com" + _href
            if not Database.find_dup(href):
                yield scrapy.Request(href, callback=udn_health.parse)
def parse(self, response):
    # Stheadline article links appear in both absolute and relative form;
    # normalize relative ones to absolute before the duplicate check.
    hrefs = response.xpath('//a/@href').extract()
    stheadline = StheadlineSpider()
    r = re.compile(r"^(http://std\.stheadline\.com/daily/)?news-content\.php\?id=\d+&target=\d+$")
    for _href in hrefs:
        if r.match(_href):
            href = _href
            if not href.startswith("http"):
                href = "http://std.stheadline.com/daily/" + href
            if not Database.find_dup(href):
                yield scrapy.Request(href, callback=stheadline.parse)
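All eighteen callbacks above repeat the same shape: extract every href, keep the ones matching a site-specific pattern, resolve relative links against a base URL, skip anything Database.find_dup has already seen, and hand the rest to the matching article spider. A minimal sketch of that shared loop factored into one helper, assuming the Database and spider interfaces used above; follow_new_links is a hypothetical name and not part of the original code:

def follow_new_links(response, pattern, callback, base_url=""):
    # Hypothetical refactoring sketch: the one match -> dedup -> dispatch
    # loop shared by every listing spider above.
    r = re.compile(pattern)
    for _href in response.xpath('//a/@href').extract():
        if not r.match(_href):
            continue
        # Resolve relative links the same way the spiders above do.
        href = _href if _href.startswith("http") else base_url + _href
        if not Database.find_dup(href):
            yield scrapy.Request(href, callback=callback)

A listing spider's parse method would then reduce to, for example:

def parse(self, response):
    ettoday_sports = EttodaySportsSpider()
    for request in follow_new_links(response, r"^/news/\d+$",
                                    ettoday_sports.parse,
                                    base_url="http://sports.ettoday.net"):
        yield request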