def get_news(self, response):
    l = ItemLoader(item=SpiderItem(), response=response)
    l.add_value('title', response.xpath('//div[@id="contentwrap"]/h1/text()').extract())
    l.add_value('date', response.xpath('//div[@class="infos"]/p/text()').extract())
    r1 = r"\d{4}\-\d{1,2}\-\d{1,2}\s\d{2}\:\d{2}\:\d{2}"
    date0 = re.compile(r1)
    date = ''.join(l.get_collected_values('date'))
    date1 = date0.findall(date)
    l.replace_value('date', date1[0])
    l.add_value('content', response.xpath('//div[@class="content"]/text()').extract())
    l.add_value('content', response.xpath('//div[@class="description"]/text()').extract())
    l.add_value('content', response.xpath('//div[@class="content"]/p/text()').extract())
    l.add_value('content', response.xpath('//div[@class="content"]/div/p/text()').extract())
    l.add_value('url', response.url)
    l.add_value('collection_name', self.name)
    return l.load_item()
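# Illustrative sketch (not from the projects above): the news spiders in this collection
# share a collect-then-normalize pattern -- add raw 'date' candidates, read them back with
# get_collected_values(), and overwrite the field with replace_value() once a clean value
# is extracted. NewsItem and the sample strings below are hypothetical; only the
# ItemLoader calls mirror the snippets in this file.
import re
import scrapy
from scrapy.loader import ItemLoader


class NewsItem(scrapy.Item):
    date = scrapy.Field()


def normalize_date_demo():
    l = ItemLoader(item=NewsItem())
    # several raw candidates, as the spiders above collect from multiple XPaths
    l.add_value('date', ['发布时间:', '2019-07-01 08:30:00 来源: example'])
    raw = ''.join(l.get_collected_values('date'))  # everything collected so far
    match = re.search(r"\d{4}-\d{1,2}-\d{1,2}\s\d{2}:\d{2}:\d{2}", raw)
    if match:
        l.replace_value('date', match.group())  # keep only the normalized value
    return l.load_item()


if __name__ == '__main__':
    print(normalize_date_demo())  # expected: {'date': ['2019-07-01 08:30:00']}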
def get_news(self, response):
    try:
        l = ItemLoader(item=SpiderItem(), response=response)
        l.add_value('title', response.xpath('//span[@id="thread_subject"]/text()').extract())
        l.add_value('date', response.xpath('//div[@class="authi"]/em/text()').extract())
        r1 = r"\d{4}\-\d{1,2}\-\d{1,2}\s\d{2}\:\d{2}"
        date0 = re.compile(r1)
        date = ''.join(l.get_collected_values('date'))
        date1 = date0.findall(date)
        l.replace_value('date', date1[0])
        l.add_value('content', response.xpath('//div[@class="t_fsz"]/table/tr/td/div/text()').extract())
        l.add_value('content', response.xpath('//div[@class="t_fsz"]/table/tr/td/br/text()').extract())
        l.add_value('content', response.xpath('//div[@class="t_fsz"]/table/tr/td/text()').extract())
        l.add_value('content', response.xpath('//div[@class="t_fsz"]/table/tr/td/div/font/font/text()').extract())
        l.add_value('content', response.xpath('//div[@class="t_fsz"]/table/tr/td/font/font/text()').extract())
        l.add_value('content', response.xpath('//div[@class="t_fsz"]/table/tr/td/font/font/font/text()').extract())
        l.add_value('content', response.xpath('//div[@class="t_fsz"]/table/tr/td/p/font/font/text()').extract())
        l.add_value('content', response.xpath('//div[@class="t_fsz"]/table/tr/td/div/div/div/font/font/strong/text()').extract())
        l.add_value('url', response.url)
        l.add_value('collection_name', self.name)
        return l.load_item()
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        l = ItemLoader(item=SpiderItem(), response=response)
        l.add_value('title', '')
        l.add_value('date', '1970-01-01 00:00:00')
        l.add_value('source', '')
        l.add_value('content', '')
        l.add_value('url', response.url)
        l.add_value('collection_name', self.name)
        l.add_value('website', self.website)
        return l.load_item()
def get_news(self, response):
    l = ItemLoader(item=SpiderItem(), response=response)
    l.add_value('title', response.xpath('//table/tr[3]/td[2]/text()').extract())
    l.add_value('date', response.xpath('//table/tr[4]/td/text()').extract())
    r1 = r"\d{4}\-\d{1,2}\-\d{1,2}"
    date0 = re.compile(r1)
    date = ''.join(l.get_collected_values('date'))
    date1 = date0.findall(date)
    l.replace_value('date', date1[0])
    l.add_value('content', response.xpath('//td[@class="tdbg"]/div/font/text()').extract())
    l.add_value('content', response.xpath('//td[@class="tdbg"]/p/font/text()').extract())
    l.add_value('content', response.xpath('//td[@class="tdbg"]/p/span/text()').extract())
    l.add_value('content', response.xpath('//td[@class="tdbg"]/p/text()').extract())
    l.add_value('url', response.url)
    l.add_value('collection_name', self.name)
    l.add_value('website', self.website)
    return l.load_item()
def get_news(self, response):
    try:
        l = ItemLoader(item=SpiderItem(), response=response)
        l.add_value('title', response.xpath('//div[@id="biaoti"]/text()').extract())
        l.add_value('title', response.xpath('//h1[@id="biaoti"]/text()').extract())
        l.add_value('date', response.xpath('//span[@id="pubtime_baidu"]/text()').extract())
        l.add_value('date', response.xpath('//div[@class="center lh32 grey12a"]/text()').extract())
        l.add_value('date', response.xpath('//div[@id="left"]/h2/text()').extract())
        l.add_value('content', response.xpath('//div[@id="zw"]/p/text()').extract())
        l.add_value('content', response.xpath('//div[@id="zw"]/strong/p/text()').extract())
        l.add_value('url', response.url)
        l.add_value('collection_name', self.name)
        url = response.url
        if url[11:17] == "shzfzz":
            date = ''.join(l.get_collected_values('date'))
            date = time.strptime(date.split()[0], u'%Y年%m月%d日')
            l.replace_value('date', time.strftime('%Y-%m-%d %H:%M:%S', date))
        return l.load_item()
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        l = ItemLoader(item=SpiderItem(), response=response)
        l.add_value('title', '')
        l.add_value('date', '1970-01-01 00:00:00')
        l.add_value('source', '')
        l.add_value('content', '')
        l.add_value('url', response.url)
        l.add_value('collection_name', self.name)
        l.add_value('website', self.website)
        return l.load_item()
def parse_detail(self, response):
    t_name = response.xpath("//h1/a/text()").extract_first()
    for post in response.xpath("//li[@class='li_capsul_entry']"):
        l = ItemLoader(item=UludagtutorialItem(), selector=post)
        l.add_value("title", response.meta.get('title_name', t_name))
        l.add_xpath("comment", ".//div[@class='entry-p']/text() | .//div[@class='entry-p']/a/text()")
        l.add_xpath("user", ".//div[@class='entry-secenekleri']/a[@class='alt-u yazar']/text()")
        l.add_xpath("date", ".//span[@class='date-u']/a/text()")
        l.add_xpath("url", "substring-after(.//div[@class='voting_nw']/a/@href, '//')")
        yield scrapy.FormRequest(
            "https://www.uludagsozluk.com/ax/?a=yenit&ne=ben&nw=pop",
            formdata={"benu": l.get_collected_values('user')[0]},
            method='POST',
            callback=self.parse_post_detail,
            dont_filter=True,
            meta={'l': l})
    next_page_url = response.xpath("//a[@class='nextpage']/@href").extract_first()
    if next_page_url is not None:
        yield scrapy.Request("https://www.uludagsozluk.com" + next_page_url,
                             callback=self.parse_detail, dont_filter=True)
def get_news(self, response):
    loader = ItemLoader(item=SpiderItem(), response=response)
    try:
        loader.add_value("title", response.xpath('//h1[@id="title"]/text()').extract_first())
        loader.add_value("title", response.xpath('//span[@id="title"]/text()').extract_first())
        loader.add_value("date", response.xpath('//span[@class="time"]/text()').extract_first())
        loader.add_value("date", response.xpath('//span[@id="pubtime"]/text()').extract_first())
        date = ''.join(loader.get_collected_values("date")).strip()
        date = time.strptime(date, '%Y年%m月%d日 %H:%M:%S')
        loader.replace_value("date", time.strftime("%Y-%m-%d %H:%M:%S", date))
        loader.add_value("content", ''.join(response.xpath('//div[@id="content"]/descendant-or-self::text()').extract()))
        loader.add_value("content", ''.join(response.xpath('//div[@class="article"]/descendant-or-self::text()').extract()))
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        loader.add_value('title', '')
        loader.add_value('date', '1970-01-01 00:00:00')
        loader.add_value('content', '')
    loader.add_value('url', response.url)
    loader.add_value('collection_name', self.name)
    loader.add_value('website', self.website)
    return loader.load_item()
def get_news(self, response):
    try:
        l = ItemLoader(item=SpiderItem(), response=response)
        l.add_value('title', response.xpath('//div[@class="l_tit"]/text()').extract())
        l.add_value('date', response.xpath('//span[@id="pubtime_baidu"]/text()').extract())
        date = ''.join(l.get_collected_values('date'))
        # date = time.strptime(date.split()[0], '%Y-%m-%d')
        # l.replace_value('date', time.strftime('%Y-%m-%d %H:%M:%S', date))
        l.add_value('content', response.xpath('//div[@class="TRS_Editor"]/p/text()').extract())
        l.add_value('content', response.xpath('//div[@class="TRS_Editor"]/p/strong/text()').extract())
        l.add_value('content', response.xpath('//div[@class="article"]/div/p/text()').extract())
        l.add_value('url', response.url)
        l.add_value('collection_name', self.name)
        return l.load_item()
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        l = ItemLoader(item=SpiderItem(), response=response)
        l.add_value('title', '')
        l.add_value('date', '1970-01-01 00:00:00')
        l.add_value('source', '')
        l.add_value('content', '')
        l.add_value('url', response.url)
        l.add_value('collection_name', self.name)
        l.add_value('website', self.website)
        return l.load_item()
def parse_item(self, response):
    l = ItemLoader(item=PropertiesItem(), response=response)
    l.add_xpath('author', '//*[@id="main-content"]/div[1]/span[2]/text()')
    l.add_xpath('title', '//*[@id="main-content"]/div[3]/span[2]/text()')
    l.add_xpath('datetime', '//*[@id="main-content"]/div[4]/span[2]/text()')
    l.add_xpath('board', '//*[@id="main-content"]/div[2]/span[2]/text()')
    l.add_xpath('category', '//*[@id="main-content"]/div[3]/span[2]/text()', re=r'^\[.+\]')
    if len(l.get_collected_values('category')) == 0:
        l.add_xpath('category', '//*[@id="main-content"]/div[3]/span[2]/text()', re='^Re')

    # Housekeeping fields
    l.add_value('url', response.url)
    l.add_value('project', self.settings.get('BOT_NAME'))
    l.add_value('spider', self.name)
    l.add_value('server', socket.gethostname())
    l.add_value('rtrv_date', datetime.datetime.now())
    return l.load_item()
def parse_article(self, response):
    loader = ItemLoader(item=XfjyArticleItem(), response=response)
    article_url = response.url
    title = response.meta["title"]
    # date = response.meta["date"]
    tags_list = response.meta["tags_list"]
    block_type = ",".join(tags_list)
    # attachments are handled here for the time being
    attchments = response.xpath("//div[@class='main_nei_you_baio_content']//span//a")
    names_urls = [(attchment.xpath(".//span//text()").extract_first(),
                   attchment.xpath(".//@href").extract_first())
                  for attchment in attchments]
    name_url = {name: response.urljoin(url) for name, url in names_urls}
    attchments = json.dumps(name_url, ensure_ascii=False)
    index = response.meta.get("index")
    loader.add_value("article_url", article_url)
    loader.add_value("title", title)
    loader.add_value("tags_list", tags_list)
    loader.add_value("block_type", block_type)
    loader.add_value("attch_name_url", attchments)
    loader.add_xpath("author", "//div[@class='main_nei_you_baio_content']//span[@class='authorstyle44003']//text()")
    loader.add_value("content", response.xpath("//div[@class='main_nei_you_baio_content']//td[@class='contentstyle44003']"))
    loader.add_xpath("img", "//div[@class='main_nei_you_baio_content']//td[@class='contentstyle44003']//@src")
    loader.add_xpath("detail_time", "//div[@class='main_nei_you_baio_content']//span[@class='timestyle44003']//text()")
    loader.add_value("index", index)
    imgs = loader.get_collected_values("img")
    if imgs:
        for img in imgs:
            if "http" in img:
                yield Request(img, callback=self.parse_img, dont_filter=True,
                              meta={"type": "image", "article_url": response.url})
    yield loader.load_item()
def get_news(self, response):
    try:
        loader = ItemLoader(item=SpiderItem(), response=response)
        loader.add_value('title', response.xpath('//div[@class="left"]/h1/text()').extract_first())
        loader.add_value('title', response.xpath('//h1[@class="h1"]/text()').extract_first())
        loader.add_value('date', response.xpath('//div[@class="zuoze"]/text()').extract_first())
        loader.add_value('date', response.xpath('//span[@class="post-time"]/text()').extract_first())
        date = ''.join(loader.get_collected_values('date'))
        if date == '':
            return
        loader.replace_value('date', date.strip() + ":00")
        loader.add_value('content', ''.join(response.xpath('//span[@id="zoom"]/descendant-or-self::text()').extract()))
        loader.add_value('content', ''.join(response.xpath('//p[@class="summary"]/descendant-or-self::text()').extract()))
        loader.add_value('url', response.url)
        loader.add_value('collection_name', self.name)
        loader.add_value('website', self.website)
        yield loader.load_item()
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        l = ItemLoader(item=SpiderItem(), response=response)
        l.add_value('title', '')
        l.add_value('date', '1970-01-01 00:00:00')
        l.add_value('source', '')
        l.add_value('content', '')
        l.add_value('url', response.url)
        l.add_value('collection_name', self.name)
        l.add_value('website', self.website)
        yield l.load_item()
def parse(self, response):
    load_data = ItemLoader(item=GoogleSearchBlockData(), response=response)
    load_data.add_xpath('name', '//div[@class="BNeawe vvjwJb AP7Wnd"]/text()')
    load_data.add_xpath('url', '//div[@class="kCrYT"]//a/@href')
    load_data.add_xpath('next_site', '//a[@class="nBDE1b G5eFlf"]/@href')
    next_page = load_data.get_collected_values('next_site')
    try:
        next_page = next_page[-1]
        yield load_data.load_item()
        yield scrapy.Request(next_page, callback=self.parse)
    except IndexError:
        self.log('\n\n My LOGGER: \n' + 'Finished downloading' + '\n\n')
def parse_article_ajax(self, response):
    try:
        article = json.loads(response.body)["data"]["article"]
    except BaseException as e:
        self.log(response.body.decode(), level=logging.DEBUG)
        self.log("Failed to parse JSON, no article found; url: {}, error: {}".format(response.url, str(e)), level=logging.ERROR)
        self.log("{}".format(str(response.request.body)), level=logging.ERROR)
    else:
        loader = ItemLoader(item=YibanArticleItem(), response=response)
        article_url = response.meta.get("article_url")
        title = response.meta.get("title")
        tags_list = ["易班"]
        tags_list.append(article.get("Sections_name"))
        block_type = ",".join(tags_list)
        content = article.get("content")
        detail_time = article.get("createTime")
        # the Yiban site has no attachments
        attchments = ""
        index = response.meta.get("index")
        loader.add_value("article_url", article_url)
        loader.add_value("title", title)
        loader.add_value("tags_list", tags_list)
        loader.add_value("block_type", block_type)
        loader.add_value("content", content)
        content_response = Selector(text=content)
        loader.add_value("img", content_response.xpath("//img//@src").extract())
        loader.add_value("detail_time", detail_time)
        loader.add_value("index", index)
        imgs = loader.get_collected_values("img")
        if imgs:
            for img in imgs:
                if "http" in img:
                    yield Request(img, callback=self.parse_img, dont_filter=True,
                                  meta={"type": "image", "article_url": article_url})
        yield loader.load_item()
def parse(self, response):
    base_post = response.css('article.post_preview')
    for post in base_post:
        item = ItemLoader(authorItem(), response)
        for key, value in self.author_css_selectors.items():
            item.add_value(key, post.css(value).extract())
        yield item.load_item()
        yield response.follow(item.get_collected_values('author_url')[0] + 'posts/',
                              callback=self.parse_author)
        yield response.follow(response.css(self.line_post_css_selectors['post_url']).extract()[0],
                              callback=self.parse_post)
def parse(self, response):
    l = ItemLoader(item=LinksItem(), response=response)
    l.add_value('url', response.meta['source_url'])
    links = set([])
    for n in [1, 2]:
        selector = 'div.entrytext p:nth-child(%s) a ::attr(href)' % n
        for link in response.css(selector).extract():
            if '/tag/' not in link:
                links.add(link)
    l.add_value('links', list(links))
    l.add_value('count', len(l.get_collected_values('links')))
    yield l.load_item()
def parse_detail(self, response, char):
    loader = ItemLoader(item=char, response=response)
    loader.add_value("url", response.url)
    loader.add_css("image", selectors["CHARACTER_IMAGE"])
    loader.add_css("name", selectors["CHARACTER_NAME"])
    loader.add_css("feature_films", selectors["CHARACTER_FEATURE_FILMS"])
    loader.add_css("short_films", selectors["CHARACTER_SHORT_FILMS"])
    loader.add_css("shows", selectors["CHARACTER_SHOWS"])
    loader.add_css("games", selectors["CHARACTER_GAMES"])
    loader.add_css("rides", selectors["CHARACTER_RIDES"])
    loader.add_css("animator", selectors["CHARACTER_ANIMATOR"])
    loader.add_css("designer", selectors["CHARACTER_DESIGNER"])
    loader.add_css("voice", selectors["CHARACTER_VOICE"])
    loader.add_css("portrayed_by", selectors["CHARACTER_PORTRAYED_BY"])
    loader.add_css("performance_model", selectors["CHARACTER_PERFORMANCE_MODEL"])
    loader.add_css("inspiration", selectors["CHARACTER_INSPIRATION"])
    loader.add_css("awards", selectors["CHARACTER_AWARDS"])
    loader.add_css("fullname", selectors["CHARACTER_FULLNAME"])
    loader.add_css("other_names", selectors["CHARACTER_OTHER_NAMES"])
    loader.add_css("occupation", selectors["CHARACTER_OCCUPATION"])
    loader.add_css("affiliations", selectors["CHARACTER_AFFILIATIONS"])
    loader.add_css("home", selectors["CHARACTER_HOME"])
    loader.add_css("likes", selectors["CHARACTER_LIKES"])
    loader.add_css("dislikes", selectors["CHARACTER_DISLIKES"])
    loader.add_css("powers", selectors["CHARACTER_POWERS"])
    loader.add_css("paraphernalia", selectors["CHARACTER_PARAPHERNALIA"])
    loader.add_css("status", selectors["CHARACTER_STATUS"])
    loader.add_css("parents", selectors["CHARACTER_PARENTS"])
    loader.add_css("siblings", selectors["CHARACTER_SIBLINGS"])
    loader.add_css("family", selectors["CHARACTER_FAMILY"])
    loader.add_css("partner", selectors["CHARACTER_PARTNER"])
    loader.add_css("children", selectors["CHARACTER_CHILDREN"])
    loader.add_css("pets", selectors["CHARACTER_PETS"])
    if len(loader.get_css(selectors["CHARACTER_NAME"])) < 1:
        loader.add_css("name", selectors["PAGE_HEADER_TITLE"])
    if len(loader.get_css(selectors["CHARACTER_IMAGE"])) < 1:
        loader.add_css("image", selectors["CHARACTER_THUMB_IMAGE"])
    logging.info("Crawl %s" % loader.get_collected_values("name"))
    char = loader.load_item()
    yield char
def parse_lot(self, response):
    l = ItemLoader(item=HbarrysmithKaufmanauctionswvAuctionsfirstResultItem(), response=response)
    l.default_output_processor = TakeFirst()
    l.add_xpath('LotNum', '//span[@class="lot-num"]/text()')
    l.add_xpath('Lead', '//span[@class="lot-name"]/text()')
    l.add_xpath('Description', 'string(//div[contains(@class, "description-info-content")])')
    l.add_xpath('Price', '//span[@id and contains(text(), "Lot closed - High bid:")]/span/text()')
    l.add_value('Sale', l.get_collected_values('Price'))
    yield l.load_item()
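# Illustrative sketch (assumed, not part of the auction spider above): with TakeFirst()
# as the default output processor, load_item() keeps only the first collected value per
# field, while get_collected_values() still exposes the full list -- which is why the
# snippet above can copy every collected 'Price' into 'Sale' before loading. LotItem and
# the literal values are made up for the demo.
import scrapy
from scrapy.loader import ItemLoader
from itemloaders.processors import TakeFirst
# in older Scrapy versions: from scrapy.loader.processors import TakeFirst


class LotItem(scrapy.Item):
    Price = scrapy.Field()
    Sale = scrapy.Field()


def derive_sale_demo():
    l = ItemLoader(item=LotItem())
    l.default_output_processor = TakeFirst()
    l.add_value('Price', ['$150', '$175'])  # e.g. opening and closing bids
    l.add_value('Sale', l.get_collected_values('Price'))  # full list, not just the first
    return l.load_item()


if __name__ == '__main__':
    print(derive_sale_demo())  # expected: {'Price': '$150', 'Sale': '$150'}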
def get_news(self, response):
    l = ItemLoader(item=SpiderItem(), response=response)
    l.add_value('title', response.xpath('//div[@class="article_title"]/text()').extract())
    l.add_value('date', response.xpath('//div[@class="article_title1"]/text()').extract())
    r1 = r"\d{1,4}"
    date0 = re.compile(r1)
    date = ''.join(l.get_collected_values('date'))
    date1 = date0.findall(date)
    date1 = date1[0] + '-' + date1[1] + '-' + date1[2]
    l.replace_value('date', date1)
    l.add_value('content', response.xpath('//div[@id="MyContent"]/p/span/text()').extract())
    l.add_value('content', response.xpath('//div[@id="MyContent"]/p/font/span/text()').extract())
    l.add_value('content', response.xpath('//p[@class="MsoNormal"]/span/span/font/span/text()').extract())
    l.add_value('content', response.xpath('//p[@class="MsoNormal"]/span/span/font/text()').extract())
    l.add_value('content', response.xpath('//div[@class="article_intro"]/text()').extract())
    l.add_value('content', response.xpath('//div[@id="MyContent"]/p/font/text()').extract())
    l.add_value('content', response.xpath('//p[@id="MsoNormal"]/span/text()').extract())
    l.add_value('url', response.url)
    l.add_value('collection_name', self.name)
    l.add_value('website', self.website)
    return l.load_item()
def get_news(self, response):
    l = ItemLoader(item=SpiderItem(), response=response)
    l.add_value('title', response.xpath('//div[@id="lbyright_xwxq_title"]/text()').extract())
    l.add_value('date', response.xpath('//div[@id="lbyright_xwxq_xxx"]/text()').extract())
    r1 = r"\d{4}\-\d{1,2}\-\d{1,2}"
    date0 = re.compile(r1)
    date = ''.join(l.get_collected_values('date'))
    date1 = date0.findall(date)
    l.replace_value('date', date1[0])
    l.add_value('content', response.xpath('//div[@id="lbyright_xwxq_txt"]/p/span/text()').extract())
    l.add_value('url', response.url)
    l.add_value('collection_name', self.name)
    l.add_value('website', self.website)
    return l.load_item()
def get_news(self, response):
    l = ItemLoader(item=SpiderItem(), response=response)
    l.add_value('title', response.xpath('//h2[@class="titleH2"]/text()').extract())
    l.add_value('title', response.xpath('//div[@class="Article-Left"]/h3/text()').extract())
    l.add_value('title', response.xpath('//div[@class="tit"]/h1/text()').extract())
    l.add_value('date', response.xpath('//div[@class="from"]/span/text()').extract())
    l.add_value('date', response.xpath('//div[@class="CopyFrom"]/text()').extract())
    l.add_value('date', response.xpath('//div[@class="auther-from"]/text()').extract())
    r1 = r"\d{4}\-\d{1,2}\-\d{1,2}\s\d{2}\:\d{2}\:\d{2}"
    date0 = re.compile(r1)
    date = ''.join(l.get_collected_values('date'))
    date1 = date0.findall(date)
    l.replace_value('date', date1[0])
    l.add_value('content', response.xpath('//div[@class="content"]/p/text()').extract())
    l.add_value('content', response.xpath('//div[@class="content"]/p/font/text()').extract())
    l.add_value('content', response.xpath('//div[@class="content"]/text()').extract())
    l.add_value('url', response.url)
    l.add_value('collection_name', self.name)
    l.add_value('website', self.website)
    return l.load_item()
def get_news(self, response):
    try:
        l = ItemLoader(item=SpiderItem(), response=response)
        l.add_value('title', response.xpath('//div[@class="wh645 left"]/p[1]/text()').extract())
        l.add_value('title', response.xpath('//p[@class="f22 lh30 yahei"]/a/text()').extract())
        l.add_value('title', response.xpath('//p[@class="f22 lh40 fb"]/text()').extract())
        l.add_value('date', response.xpath('//p[@class="lh30 left f14 yahei"]/text()').extract())
        l.add_value('date', response.xpath('//span[@id="pubtime_baidu"]/text()').extract())
        l.add_value('title', response.xpath('//div[@class="top"]/p/text()').extract())
        date = ''.join(l.get_collected_values('date'))
        # date = time.strptime(date.split()[0], '%Y-%m-%d')
        # l.replace_value('date', time.strftime('%Y-%m-%d %H:%M:%S', date))
        l.add_value('content', response.xpath('//div[@class="TRS_Editor"]/p/text()').extract())
        l.add_value('content', response.xpath('//div[@class="TRS_Editor"]/div/p/text()').extract())
        l.add_value('content', response.xpath('//div[@class="sanji_left"]/p/text()').extract())
        l.add_value('content', response.xpath('//div[@class="news-con"]/div/p/text()').extract())
        l.add_value('content', response.xpath('//div[@class="news-con"]/div/div/div/font/text()').extract())
        l.add_value('content', response.xpath('//div[@id="news-con"]/div/p/text()').extract())
        l.add_value('content', response.xpath('//div[@id="news-con"]/div/font/font/p/text()').extract())
        l.add_value('url', response.url)
        l.add_value('collection_name', self.name)
        url = response.url
        return l.load_item()
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        l = ItemLoader(item=SpiderItem(), response=response)
        l.add_value('title', '')
        l.add_value('date', '1970-01-01 00:00:00')
        l.add_value('source', '')
        l.add_value('content', '')
        l.add_value('url', response.url)
        l.add_value('collection_name', self.name)
        l.add_value('website', self.website)
        return l.load_item()
def parse_article(self, response):
    loader = ItemLoader(item=OfficialItem(), response=response)
    index = response.meta.get("index")
    title = response.meta.get('title', None)
    tags_list = response.meta.get('tags_list')
    block_type = ",".join(tags_list)
    # fields to extract from the article: title, detail time, content, author, source
    article = response.xpath("//div[@class='article']")
    if not title:
        loader.add_xpath("title", ".//h1[@class='arti-title']//text()")
    else:
        loader.add_value("title", title)
    article_metas = article.xpath(".//p[@class='arti-metas']//span//text()").extract()
    loader.add_value("detail_time", article_metas[0])
    loader.add_value("author", article_metas[1], re='作者:(.*)')
    loader.add_value("block_type", block_type)
    loader.add_value("content", response.xpath("//div[@id='content']"))
    loader.add_xpath("img", "//div[@id='content']//@src")
    loader.add_value("article_url", response.url)
    loader.add_value("tags_list", tags_list)
    loader.add_value("index", index)
    imgs = loader.get_collected_values("img")
    if imgs:
        for img in imgs:
            if "http" in img:
                yield Request(img, callback=self.parse_img, dont_filter=True,
                              meta={"type": "image", "article_url": response.url})
    yield loader.load_item()
def get_news(self, response):
    try:
        l = ItemLoader(item=SpiderItem(), response=response)
        l.add_value('title', response.xpath('//div[@class="layout"]/h2/text()').extract())
        l.add_value('title', response.xpath('//div[@id="wrapper"]/h1/text()').extract())
        l.add_value('title', response.xpath('//div[@class="top"]/h1/text()').extract())
        l.add_value('date', response.xpath('//div[@class="layout"]/div/text()').extract())
        l.add_value('date', response.xpath('//div[@class="left"]/span/text()').extract())
        l.add_value('title', response.xpath('//div[@class="top"]/p/text()').extract())
        date = ''.join(l.get_collected_values('date'))
        date = time.strptime(date.split()[0], '%Y-%m-%d')
        l.replace_value('date', time.strftime('%Y-%m-%d %H:%M:%S', date))
        l.add_value('content', response.xpath('//div[@class="news-con"]/p/text()').extract())
        l.add_value('content', response.xpath('//div[@id="news-con"]/div/div/p/text()').extract())
        l.add_value('content', response.xpath('//div[@class="news-con"]/div/p/text()').extract())
        l.add_value('content', response.xpath('//div[@class="news-con"]/div/div/div/font/text()').extract())
        l.add_value('content', response.xpath('//div[@id="news-con"]/div/p/text()').extract())
        l.add_value('content', response.xpath('//div[@id="news-con"]/div/font/font/p/text()').extract())
        l.add_value('url', response.url)
        l.add_value('collection_name', self.name)
        url = response.url
        return l.load_item()
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        l = ItemLoader(item=SpiderItem(), response=response)
        l.add_value('title', '')
        l.add_value('date', '1970-01-01 00:00:00')
        l.add_value('source', '')
        l.add_value('content', '')
        l.add_value('url', response.url)
        l.add_value('collection_name', self.name)
        l.add_value('website', self.website)
        return l.load_item()
def parse_info(self, response):
    loaderJob = ItemLoader(item=JobInfoItem(), response=response)
    loaderJob.add_value("url", value=response.url)
    loaderJob.add_value("job_category", value=urllib.unquote(response.meta["category"]))
    loaderJob.add_xpath("job_name", '//div[@class="title-info over"]/h1/text()')
    loaderJob.add_xpath("job_name", '//div[@class="title-info "]/h1/text()')
    loaderJob.add_xpath("job_company", '//div[@class="title-info over"]/h3/text()')
    loaderJob.add_xpath("job_company", '//div[@class="title-info "]/h3/text()')
    loaderJob.add_xpath("job_company", '//div[@class="title-info "]/h3/a/text()')
    loaderJob.add_xpath("job_miniEdu", '//div[@class="resume clearfix"]/span/text()', TakeNumL(0))
    loaderJob.add_xpath("job_experience", '//div[@class="resume clearfix"]/span/text()', TakeNumL(1))
    loaderJob.add_xpath("job_reqLan", '//div[@class="resume clearfix"]/span/text()', TakeNumL(2))
    loaderJob.add_xpath("job_reqAge", '//div[@class="resume clearfix"]/span/text()', TakeNumL(3))
    loaderJob.add_xpath("job_salary", '//p[@class="job-main-title"]/text()', TakeFirstL())
    loaderJob.add_xpath("job_location", '//p[@class="basic-infor"]/span[1]/text()', TakeFirstL())
    loaderJob.add_xpath("job_update", '//p[@class="basic-infor"]/span[2]/text()', TakeFirstL(), re=u"(?<=发布于:).*")
    loaderJob.add_xpath("job_desc", '//div[@class="content content-word"][1]', RemoveTagsL(), StripBlankL(), JoinL(""))
    loaderJob.add_xpath("job_benefits", '//div[@class="job-main main-message"]', RemoveTagsL(), ReplaceBlank(), re=u"(?<=薪酬福利:)[\s\S]*")
    loaderJob.add_xpath("job_benefits", '//div[@class="tag-list clearfix"]/span/text()', JoinL("|"))
    yield loaderJob.load_item()

    if "job.liepin.com" in response.url:
        loaderCom = ItemLoader(item=ComInfoItem(), response=response)
        loaderCom.add_value("url", value=response.url)
        loaderCom.add_value("com_name", value=loaderJob.get_collected_values("job_company"))
        loaderCom.add_xpath("com_industry", '//div[@class="right-post-top"]/div[@class="content content-word"]/a[1]/@title', TakeFirstL())
        loaderCom.add_xpath("com_size", '//div[@class="right-post-top"]/div[@class="content content-word"]', RemoveTagsL(), re=u"(?<=规模:)[\s\S]*?(?=<br>)")
        loaderCom.add_xpath("com_nature", '//div[@class="right-post-top"]/div[@class="content content-word"]', RemoveTagsL(), re=u"(?<=性质:)[\s\S]*?(?=<br>)")
        loaderCom.add_xpath("com_address", '//div[@class="right-post-top"]/div[@class="content content-word"]', RemoveTagsL(), re=u"(?<=地址:)[\s\S]*")
        loaderCom.add_xpath("com_intro", '//div[@class="job-main main-message noborder "]/div[@class="content content-word"]/text()', StripBlankL(), TakeFirstL())
        yield loaderCom.load_item()
def parse_post(self, response):
    '''
    parse post does multiple things:
        1) loads replied-to-comments page one-by-one (for DFS)
        2) call parse_reply on the nested comments
        3) adds simple (not-replied-to) comments
        4) follows to new comment page
    '''
    # load replied-to comments pages
    # select nested comment one-by-one matching with the index: response.meta['index']
    path = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and .//div[contains(@id,"comment_replies")]]' + '[' + str(response.meta['index']) + ']'
    group_flag = response.meta['group'] if 'group' in response.meta else None

    for reply in response.xpath(path):
        source = reply.xpath('.//h3/a/text()').extract()
        answer = reply.xpath('.//a[contains(@href,"repl")]/@href').extract()
        ans = response.urljoin(answer[::-1][0])
        self.logger.info('{} nested comment'.format(str(response.meta['index'])))
        yield scrapy.Request(ans, callback=self.parse_reply, priority=1000,
                             meta={'reply_to': source, 'url': response.url,
                                   'index': response.meta['index'],
                                   'flag': 'init', 'group': group_flag})

    # load regular comments
    if not response.xpath(path):  # prevents from exec
        path2 = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and not(.//div[contains(@id,"comment_replies")])]'
        for i, reply in enumerate(response.xpath(path2)):
            self.logger.info('{} regular comment'.format(i + 1))
            new = ItemLoader(item=CommentsItem(), selector=reply)
            new.context['lang'] = self.lang
            new.add_xpath('source', './/h3/a/text()')
            new.add_xpath('source_url', './/h3/a/@href')
            new.add_xpath('text', './/div[h3]/div[1]//text()')
            new.add_xpath('date', './/abbr/text()')
            new.add_xpath('reactions', './/a[contains(@href,"reaction/profile")]//text()')
            new.add_value('url', response.url)
            """
            PROFILE REACTIONS SECTION
            adds functionality for adding profile and specific reaction data
            gets the profile url, creates a new item
            if the profile exists, add info to new item and increment 'check' to signal
            that new information has been added to the item and it's already been yielded
            repeat this process for reactions
            """
            # profile = response.xpath(".//h3/a/@href")
            # profile = response.urljoin(profile[0].extract())
            profile = "https://mbasic.facebook.com" + new.get_collected_values('source_url')[0]
            # print('profile', profile)
            # print('new item', new.get_collected_values('name'))
            item = new.load_item()
            check = 0
            if profile:
                check += 1
                yield scrapy.Request(profile, callback=self.parse_profile, meta={'item': item})

            temp = ItemLoader(item=CommentsItem(), selector=reply)
            temp.context['lang'] = self.lang
            temp.add_xpath('reactions', './/a[contains(@href,"reaction/profile")]/@href')
            reactions = temp.get_collected_values('reactions')
            if reactions:
                check += 1
                reactions = "https://mbasic.facebook.com" + temp.get_collected_values('reactions')[0]
                temp = 0
                yield scrapy.Request(reactions, callback=self.parse_reactions, meta={'item': item})
            if check == 0:
                yield item

    # new comment page
    if not response.xpath(path):
        # for groups
        next_xpath = './/div[contains(@id,"see_next")]'
        prev_xpath = './/div[contains(@id,"see_prev")]'
        if not response.xpath(next_xpath) or group_flag == 1:
            for next_page in response.xpath(prev_xpath):
                new_page = next_page.xpath('.//@href').extract()
                new_page = response.urljoin(new_page[0])
                self.logger.info('New page to be crawled {}'.format(new_page))
                yield scrapy.Request(new_page, callback=self.parse_post,
                                     meta={'index': 1, 'group': 1})
        else:
            for next_page in response.xpath(next_xpath):
                new_page = next_page.xpath('.//@href').extract()
                new_page = response.urljoin(new_page[0])
                self.logger.info('New page to be crawled {}'.format(new_page))
                yield scrapy.Request(new_page, callback=self.parse_post,
                                     meta={'index': 1, 'group': group_flag})
def get_news(self, response):
    loader = ItemLoader(item=SpiderItem(), response=response)
    try:
        loader.add_value("title", response.xpath('//h1[@id="articleTitle"]/text()').extract_first())
        loader.add_value("title", response.xpath('//div[@id="articleTitle"]/text()').extract_first())
        loader.add_value("title", response.xpath('//h2[@id="toptitle"]/a/text()').extract_first())
        loader.add_value("title", response.xpath('//div[@class="tit_dt"]/b/text()').extract_first())
        loader.add_value("title", response.xpath('//div[@id="ArticleTitle"]/text()').extract_first())
        loader.add_value("title", response.xpath('//h1[@class="picContentHeading"]/text()').extract_first())
        date = response.xpath('//span[@id="pubTime"]/text()').extract_first()
        if date:
            loader.add_value("date", date + ":00")
        loader.add_value("date", ''.join(response.xpath('//div[@id="ArticleSourceAuthor"]/text()').extract()).strip()[:19])
        if ''.join(loader.get_collected_values("date")) == '':
            # fall back to the date embedded in the URL when nothing was collected
            end = response.url.find('/content_')
            loader.add_value("date", response.url[end - 10:end].replace('/', '-') + " 00:00:00")
        loader.add_value("content", ''.join(response.xpath('//div[@id="contentMain"]/descendant-or-self::text()').extract()))
        loader.add_value("content", ''.join(response.xpath('//div[@style="padding:15px 15px;line-height:28px;"]/descendant-or-self::text()').extract()))
        loader.add_value("content", ''.join(response.xpath('//div[@class="con_dt"]/descendant::text()').extract()))
        loader.add_value("content", ''.join(response.xpath('//div[@id="ArticleContent"]/descendant::text()').extract()))
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        loader.add_value('title', '')
        loader.add_value('date', '1970-01-01 00:00:00')
        loader.add_value('content', '')
    loader.add_value('url', response.url)
    loader.add_value('collection_name', self.name)
    loader.add_value('website', self.website)
    return loader.load_item()
def parse_post(self, response):
    # Web elements to extract post
    thread_item = response.meta['thread_item']
    thread_loader = ItemLoader(item=thread_item)
    threadtitle = thread_loader.get_collected_values('threadtitle')
    for product in response.xpath("//div[contains(@class, 'post_block')]"):
        loader = ItemLoader(item=PostItem(), selector=product)
        loader.add_value('threadtitle', threadtitle)
        temp = product.css("div.post_block div.post_wrap div.post_body").extract()
        temp = re.sub(r'<br>|<strong>|<\/strong>|<em>|<\/em>', ' ', temp[0])
        temp = re.sub(r'\n', ' ', temp)
        temp = re.sub(r'<blockquote(.*?)blockquote>', ' ', str(temp))
        selector = scrapy.Selector(text=str(temp))
        loader.add_value("postcontent", selector.xpath("//div[contains(@class,'post_body')]/div[@itemprop='commentText'][1]").extract())
        loader.add_value("authorname", product.css("div.post_wrap div.author_info div.user_details span[itemprop='name']::text").get(default='N/A'))
        loader.add_value("authortype", product.css("div.post_wrap div.author_info div.user_details li.group_title::text").get())
        loader.add_value("noposts", product.css("div.post_wrap div.author_info div.user_details li.post_count::text").get())
        if len(product.css("div.post_wrap div.post_body div.signature").getall()) > 0:
            loader.add_value("authorsign", product.css("div.post_wrap div.post_body div.signature").getall())
        else:
            loader.add_value("authorsign", ['N/A'])
        loader.add_value("date", product.css("div.post_wrap div.post_body p.posted_info abbr.published::text").get())
        yield loader.load_item()

    next_page = response.xpath(
        "//div[contains(@class, 'topic_controls')]/div[contains(@class, 'pagination')]/"
        "ul[contains(@class, 'forward')]/li[contains(@class, 'next')]/a/@href").extract_first()
    if next_page is not None:
        yield response.follow(next_page, callback=self.parse_post, meta={'thread_item': thread_item})
def get_news(self, response):
    try:
        loader = ItemLoader(item=SpiderItem(), response=response)
        loader.add_value('title', response.xpath('//div[@class="text"]/h1/text()').extract_first())
        loader.add_value('title', response.xpath('//div[@class="text_c clearfix"]/h1/text()').extract_first())
        loader.add_value('title', response.xpath('//div[@class="text_c"]/h1/text()').extract_first())
        loader.add_value('title', response.xpath('//div[@class="d2_left wb_left fl"]/h1/text()').extract_first())
        loader.add_value('date', response.xpath('//p[@class="text_tools"]/text()').extract_first())
        loader.add_value('date', response.xpath('//div[@class="text_c clearfix"]/h5/text()').extract_first())
        loader.add_value('date', response.xpath('//p[@class="sou"]/text()').extract_first())
        loader.add_value('date', response.xpath('//span[@id="p_publishtime"]/text()').extract_first())
        date = ''.join(loader.get_collected_values('date'))
        date = time.strptime(date.split()[0], '%Y年%m月%d日%H:%M')
        loader.replace_value('date', time.strftime('%Y-%m-%d', date))
        loader.add_value('content', ''.join(response.xpath('//div[@class="text_c"]/descendant-or-self::text()').extract()))
        loader.add_value('content', ''.join(response.xpath('//div[@class="text_show"]/descendant-or-self::text()').extract()))
        loader.add_value('content', ''.join(response.xpath('//div[@class="show_text"]/descendant-or-self::text()').extract()))
        loader.add_value('content', ''.join(response.xpath('//div[@id="p_content"]/descendant-or-self::text()').extract()))
        loader.add_value('url', response.url)
        loader.add_value('collection_name', self.name)
        loader.add_value('website', self.website)
        return loader.load_item()
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        l = ItemLoader(item=SpiderItem(), response=response)
        l.add_value('title', '')
        l.add_value('date', '1970-01-01')
        l.add_value('source', '')
        l.add_value('content', '')
        l.add_value('url', response.url)
        l.add_value('collection_name', self.name)
        l.add_value('website', self.website)
        return l.load_item()
def parse_reply(self, response):
    '''
    parse reply to comments, root comment is added if flag
    '''
    # from scrapy.utils.response import open_in_browser
    # open_in_browser(response)

    if response.meta['flag'] == 'init':
        # parse root comment
        for root in response.xpath('//div[contains(@id,"root")]/div/div/div[count(@id)!=1 and contains("0123456789", substring(@id,1,1))]'):
            new = ItemLoader(item=CommentsItem(), selector=root)
            new.context['lang'] = self.lang
            new.add_xpath('source', './/h3/a/text()')
            new.add_xpath('source_url', './/h3/a/@href')
            new.add_value('reply_to', 'ROOT')
            new.add_xpath('text', './/div[1]//text()')
            new.add_xpath('date', './/abbr/text()')
            new.add_xpath('reactions', './/a[contains(@href,"reaction/profile")]//text()')
            new.add_value('url', response.url)  # response --> reply/root
            """
            PROFILE REACTIONS SECTION (REPEAT SEE LINE 176)
            the only difference is that, when getting the item temporarily
            the selector is the root instead of the reply, (it matches the for loop)
            """
            # profile = response.xpath(".//h3/a/@href")
            # profile = response.urljoin(profile[0].extract())
            profile = "https://mbasic.facebook.com" + new.get_collected_values('source_url')[0]
            print('profile', profile)
            # print('new item', new.get_collected_values('name'))
            item = new.load_item()
            check = 0
            if profile:
                check += 1
                yield scrapy.Request(profile, callback=self.parse_profile, meta={'item': item})

            # reactions = new.get_value('reactions')
            # print("reactions", reactions)
            temp = ItemLoader(item=CommentsItem(), selector=root)
            temp.context['lang'] = self.lang
            temp.add_xpath('reactions', './/a[contains(@href,"reaction/profile")]/@href')
            reactions = temp.get_collected_values('reactions')
            if reactions:
                check += 1
                reactions = "https://mbasic.facebook.com" + temp.get_collected_values('reactions')[0]
                temp = 0
                yield scrapy.Request(reactions, callback=self.parse_reactions, meta={'item': item})
            if check == 0:
                yield item

        # parse all replies in the page
        for reply in response.xpath('//div[contains(@id,"root")]/div/div/div[count(@id)=1 and contains("0123456789", substring(@id,1,1))]'):
            new = ItemLoader(item=CommentsItem(), selector=reply)
            new.context['lang'] = self.lang
            new.add_xpath('source', './/h3/a/text()')
            new.add_xpath('source_url', './/h3/a/@href')
            new.add_value('reply_to', response.meta['reply_to'])
            new.add_xpath('text', './/div[h3]/div[1]//text()')
            new.add_xpath('date', './/abbr/text()')
            new.add_xpath('reactions', './/a[contains(@href,"reaction/profile")]//text()')
            new.add_value('url', response.url)
            """
            PROFILE REACTIONS SECTION (REPEAT SEE LINE 176)
            """
            # profile = response.xpath(".//h3/a/@href")
            # profile = response.urljoin(profile[0].extract())
            profile = "https://mbasic.facebook.com" + new.get_collected_values('source_url')[0]
            # print('new item', new.get_collected_values('name'))
            item = new.load_item()
            check = 0
            if profile:
                check += 1
                yield scrapy.Request(profile, callback=self.parse_profile, meta={'item': item})

            temp = ItemLoader(item=CommentsItem(), selector=reply)
            temp.context['lang'] = self.lang
            temp.add_xpath('reactions', './/a[contains(@href,"reaction/profile")]/@href')
            reactions = temp.get_collected_values('reactions')
            if reactions:
                check += 1
                reactions = "https://mbasic.facebook.com" + temp.get_collected_values('reactions')[0]
                temp = 0
                yield scrapy.Request(reactions, callback=self.parse_reactions, meta={'item': item})
            if check == 0:
                yield item

        back = response.xpath('//div[contains(@id,"comment_replies_more_1")]/a/@href').extract()
        if back:
            self.logger.info('Back found, more nested comments')
            back_page = response.urljoin(back[0])
            yield scrapy.Request(back_page, callback=self.parse_reply, priority=1000,
                                 meta={'reply_to': response.meta['reply_to'], 'flag': 'back',
                                       'url': response.meta['url'],
                                       'index': response.meta['index'],
                                       'group': response.meta['group']})
        else:
            next_reply = response.meta['url']
            self.logger.info('Nested comments crawl finished, heading to proper page: {}'.format(response.meta['url']))
            yield scrapy.Request(next_reply, callback=self.parse_post,
                                 meta={'index': response.meta['index'] + 1,
                                       'group': response.meta['group']})

    elif response.meta['flag'] == 'back':
        """
        adds random time pauses to prevent blocking
        DOWNSIDE: the algorithm will go slower, but still runs pretty quickly
        the greater the length of time, the more likely you'll go undetected,
        but if you're using a large amount of data, this may be unreasonable
        """
        # print("did we make it")
        r = randrange(0, 20)
        time.sleep(r)

        # parse all comments
        for reply in response.xpath('//div[contains(@id,"root")]/div/div/div[count(@id)=1 and contains("0123456789", substring(@id,1,1))]'):
            # print("reply")
            new = ItemLoader(item=CommentsItem(), selector=reply)
            new.context['lang'] = self.lang
            new.add_xpath('source', './/h3/a/text()')
            new.add_xpath('source_url', './/h3/a/@href')
            new.add_value('reply_to', response.meta['reply_to'])
            new.add_xpath('text', './/div[h3]/div[1]//text()')
            new.add_xpath('date', './/abbr/text()')
            new.add_xpath('reactions', './/a[contains(@href,"reaction/profile")]//text()')
            new.add_value('url', response.url)
            """
            SECTION (REPEAT SEE LINE 176)
            """
            profile = "https://mbasic.facebook.com" + new.get_collected_values('source_url')[0]
            # profile = response.xpath(".//h3/a/@href")
            # profile = response.urljoin(profile[0].extract())
            # print('profile', profile)
            # print('new item', new.get_collected_values('name'))
            check = 0
            item = new.load_item()
            if profile:
                check += 1
                print(1)
                yield scrapy.Request(profile, callback=self.parse_profile, meta={'item': item})

            # response --> reply/root
            # print("before ", item)
            temp = ItemLoader(item=CommentsItem(), selector=reply)
            temp.context['lang'] = self.lang
            temp.add_xpath('reactions', './/a[contains(@href,"reaction/profile")]/@href')
            reactions = temp.get_collected_values('reactions')
            if reactions:
                check += 1
                reactions = "https://mbasic.facebook.com" + temp.get_collected_values('reactions')[0]
                temp = 0
                print(2)
                yield scrapy.Request(reactions, callback=self.parse_reactions, meta={'item': item})
            if check == 0:
                print(3)
                yield item
            # print("after ", item)

        # keep going backwards
        back = response.xpath('//div[contains(@id,"comment_replies_more_1")]/a/@href').extract()
        self.logger.info('Back found, more nested comments')
        if back:
            back_page = response.urljoin(back[0])
            yield scrapy.Request(back_page, callback=self.parse_reply, priority=1000,
                                 meta={'reply_to': response.meta['reply_to'], 'flag': 'back',
                                       'url': response.meta['url'],
                                       'index': response.meta['index'],
                                       'group': response.meta['group']})
        else:
            next_reply = response.meta['url']
            self.logger.info('Nested comments crawl finished, heading to home page: {}'.format(response.meta['url']))
            yield scrapy.Request(next_reply, callback=self.parse_post,
                                 meta={'index': response.meta['index'] + 1,
                                       'group': response.meta['group']})
def getInfo(self, res):
    if not mch(res):
        return
    response = etree.HTML(res.text)
    loader = ItemLoader(item=booking.Booking(), response=res)
    supplier_obj_id = res.meta.get('statics.hotels.id')
    supplier_name = res.meta.get('statics.hotels.supplier')
    if supplier_obj_id:
        loader.add_value('statics_hotels_id', supplier_obj_id)
        loader.add_value('statics_hotels_supplier', supplier_name)
    pic = []
    for e in self.allXpath:
        Xpath = eval('bk.' + e)
        fielName, lable = '_'.join(e.split('_')[:-1]), e.split('_')[-1]
        tempResult = ''
        if lable == 'non':
            if response.xpath(Xpath):
                tempResult = response.xpath(Xpath)[0].strip()
        elif lable == 'ren':
            if re.findall(Xpath, res.text):
                tempResult = re.findall(Xpath, res.text)[0].strip()
        elif lable == 'rea':
            if re.findall(Xpath, res.text):
                for each in re.findall(Xpath, res.text):
                    tempResult += each.strip()
        elif lable == 'sub':
            if response.xpath(Xpath):
                tempResult = re.sub('\\n+', '\\n', response.xpath(Xpath)[0].xpath('string(.)')).strip()
        elif lable == 'sua':
            selects, subSelcets, y = Xpath.split('weego')[0], Xpath.split('weego')[1], Xpath.split('weego')[2:]
            for each in response.xpath(selects):
                temp = each.xpath(subSelcets)
                if isinstance(temp, list):
                    tempResult += temp[0]
                elif isinstance(temp, str):
                    tempResult += temp
            tempResult = re.sub('\\n+', '\\n', tempResult).strip()
        elif lable == 'pic':
            selects, subSelcets, y = Xpath.split('weego')[0], Xpath.split('weego')[1], Xpath.split('weego')[2:]
            for each in response.xpath(selects):
                temp = each.xpath(subSelcets)
                pic.append(temp[0])
            tempResult = pic
        elif lable == 'pir':
            for each in re.findall(Xpath, res.text):
                pic.append(each)
            tempResult = pic
        elif lable == 'xpl':
            selects, subSelcets, y = Xpath.split('weego')[0], Xpath.split('weego')[1], Xpath.split('weego')[2:]
            tl = []
            for each in response.xpath(selects):
                temp = re.sub('\\n+', ' - ', each.xpath(subSelcets).strip())
                tl.append(temp)
            loader.add_value(fielName.lower(), tl)
        if lable != 'xpl':
            if loader.get_collected_values(fielName.lower()):
                if loader.get_collected_values(fielName.lower())[0] == '':
                    loader.replace_value(fielName.lower(), tempResult)
            else:
                loader.add_value(fielName.lower(), tempResult)
    yield loader.load_item()
def parse_course(self, response):
    l = ItemLoader(item=ConestogacCourseItem(), response=response)
    l.default_output_processor = TakeFirst()
    course_data = response.xpath('//div[@data-accordion][1]')
    l.add_value('institution_name', 'Conestoga College')
    l.add_xpath('course_code', '//div[@class="hero-banner"]//span/text()')
    l.add_xpath('course_name', '//h1[contains(@class, "text-white")]/text()')
    l.add_value('delivery_types', course_data.xpath('.//small[strong[contains(text(), "Delivery:")]]/following-sibling::small/text()').get())
    l.add_value('url', response.url)
    # l.add_value('faculty', '???????????')
    l.add_xpath('description', '//h2[contains(text(), "Course description")]/following-sibling::p[1]/text()')

    price = course_data.xpath('.//small[strong[contains(text(), "Cost:")]]/following-sibling::small/text()').get()
    if price:
        price = price.lstrip('$')
    else:
        price = '0.0'
    l.add_value('price', [price])

    weekday_time_data = course_data.xpath('.//small[strong[contains(text(), "Day/Time:")]]/following-sibling::small/text()').getall()
    if not weekday_time_data:
        return False
    weekday_time_data = [remove_garbage(data) for data in weekday_time_data]
    # ['Thurs. 9:00am – 4:00pm', 'Fri. 9:00am – 4:00pm']
    weekday_time_data = [data for data in weekday_time_data if len(data) > 1]
    if weekday_time_data:
        weekdays = [re.search(r'(^\w+)', d).group(1) if re.search(r'(^\w+)', d) else '' for d in weekday_time_data]
        weekdays = [d for d in weekdays if d]
    else:
        weekdays = []
    l.add_value('days', [weekdays])

    l.add_value('prerequisite', response.xpath('//strong[contains(text(), "Prerequisites:")]/following-sibling::a/text()').getall())
    l.add_value('corequisites', response.xpath('//strong[contains(text(), "Corequisites:")]/following-sibling::a/text()').getall())
    l.add_value('program', 'Continuing Education')

    if weekday_time_data:
        duration_hours_list = [re.findall(r'\d{1,2}:\d{1,2}\w{2}', t) for t in weekday_time_data]
    else:
        duration_hours_list = []
    l.add_value('duration_hours', duration_hours_list)
    l.add_value('duration_days_week', l.get_collected_values('days'))

    start_date = course_data.xpath('.//small[strong[contains(text(), "Start Date:")]]/following-sibling::small/text()').get()
    if start_date:
        start_date = re.sub(r'(\s*\.\s+|\s*,\s+)', '-', start_date)
        start_date = datetime.strptime(start_date, '%b-%d-%Y')
    end_date = course_data.xpath('.//small[strong[contains(text(), "End date:")]]/following-sibling::small/text()').get()
    if end_date:
        end_date = re.sub(r'(\s*\.\s+|\s*,\s+)', '-', end_date)
        end_date = datetime.strptime(end_date, '%b-%d-%Y')
    duration_month_list = [[start_date, end_date]]
    l.add_value('duration_months', duration_month_list)

    l.add_value('duration_as_string', [
        l.get_collected_values('duration_hours'),
        l.get_collected_values('duration_days_week'),
        l.get_collected_values('duration_months'),
    ])

    hours_site = course_data.xpath('.//small[strong[contains(text(), "Hours:")]]/following-sibling::small/text()').get()
    if not hours_site:
        hours_site = 0
    l.add_value('total_hours', [
        l.get_collected_values('duration_hours'),
        l.get_collected_values('duration_days_week'),
        hours_site,
    ])
    yield l.load_item()
def parse_program(self, response):
    programs = response.xpath('//div[h2[@id]]')
    for program_block in programs:
        program = program_block.xpath('./h2/text()').get()
        program_block_html_string = program_block.get()
        program_block_html_string = re.sub(r'^\s*<div>\s*', '', program_block_html_string)
        program_block_html_string = re.sub(r'\s*</div>\s*$', '', program_block_html_string)
        courses = program_block_html_string.split('<hr class="modest">')
        courses = [el for el in courses if el]
        for course_html in courses:
            course = Selector(text=course_html)
            l = ItemLoader(item=CamosunCourseItem())
            # l.default_input_processor = MapCompose(lambda x: x.strip())
            l.default_output_processor = Join(' | ')
            course = course.xpath('//h3[@id and not(following-sibling::p[contains(@class, "alert-info")]) and not(following-sibling::del)]')
            # If the block has no matching h3 element, skip it
            if not course:
                continue
            l.add_value('institution_name', 'Camosun College')
            l.add_value('course_code', course.xpath('./@id').get())
            l.add_value('course_name', course.xpath('./text()').get())
            l.add_value('delivery_types', 'Onsite')
            l.add_value('url', response.url)
            l.add_value('faculty', response.meta['faculty'])
            l.add_value('description', course.xpath('./following-sibling::p[1]//text()').getall())
            ul_blocks = course.xpath('./following-sibling::ul[contains(string(), "$")]')
            # Skip the course if there is no ul block with days and price
            if not ul_blocks:
                continue
            ul_data = []
            dates_data = []
            for ul in ul_blocks:
                # Parse weekdays and times
                ul_string = remove_tags(ul.get())
                ul_string = re.sub(r'\s{2,}', ' ', ul_string)
                ul_string = remove_garbage(ul_string)
                ul_string = ul_string.strip()
                ul_data.append(ul_string)
                # Parse the dates text node
                date_string = ul.xpath('./preceding-sibling::text()[1]').get('')
                date_string = remove_garbage(date_string)
                # First check whether we got dates or just caught the bullets
                if len(date_string) < 5:
                    date_string = ul.xpath('(./preceding-sibling::text()[2])').get('')
                    date_string = remove_garbage(date_string)
                # Remove garbage up to 2019
                re_search = re.search(r'^(.+)2019', date_string)
                if re_search:
                    remove_pattern = re.escape(re_search.group(1))
                    date_string = re.sub(remove_pattern, '', date_string)
                # Only keep strings that contain 2019 in the list of dates
                if '2019' in date_string:
                    dates_data.append(date_string.strip())
            prices = [re.search(r'\$(\d+)', p).group(1) if re.search(r'\$(\d+)', p) else '0.0' for p in ul_data if p]
            l.add_value('price', prices)
            # l.add_value('subject', ul_data)
            # Get the weekday strings
            # Drop strings that do not contain a time
            weekdays = [wd if re.search(r'\d+:\d+\w{2}', wd) else '' for wd in ul_data if wd]
            # Keep the weekday part of each string
            weekdays = [re.search(r'^[^\d]+', wd).group() if re.search(r'^[^\d]+', wd) else [] for wd in weekdays if wd]
            # Strip bullets at the end of the string
            weekdays = [re.sub(r'\W+$', '', i) for i in weekdays if i]
            # Drop strings left empty by the clean-up above
            weekdays = [wd.split(' ') for wd in weekdays if wd]
            l.add_value('days', weekdays)
            l.add_value('program', program)
            # Get time groups like DD:DDam-DD:DDam
            duration_hours = [re.findall(r'(\d+:\d+\w{2}-\d+:\d+\w{2})', tm) for tm in ul_data if tm]
            # Prepare a list of times like [['6:30pm', '9:30pm'], ['8:30am', '4:30pm']]
            # duration_hours = [tm[0].split('-') for tm in duration_hours if tm]
            duration_hours_list = []
            for tm in duration_hours:
                if not tm:
                    continue
                if len(tm) > 1:
                    for interval in tm:
                        duration_hours_list.append(interval.split('-'))
                else:
                    duration_hours_list.append(tm[0].split('-'))
            l.add_value('duration_hours', duration_hours_list)
            l.add_value('duration_days_week', l.get_collected_values('days'))
            # Look for the month interval
            duration_month_list = []
            dur_month_tpl = '{year} {month}'
            for mon in dates_data:
                if not mon:
                    continue
                mon_res = re.search(r'(2019).+(\w{3} \d+) - (\w{3} \d+)?|(2019).+(\w{3} \d+)', mon)
                if not mon_res:
                    continue
                year, start_m, end_m, one_year, one_m = mon_res.groups()
                if one_m:
                    m_start = dur_month_tpl.format(year=one_year, month=one_m)
                    m_end = dur_month_tpl.format(year=one_year, month=one_m)
                else:
                    m_start = dur_month_tpl.format(year=year, month=start_m)
                    m_end = dur_month_tpl.format(year=year, month=end_m)
                duration_month_list.append([m_start, m_end])
            l.add_value('duration_months', duration_month_list)
            l.add_value('duration_as_string', [
                l.get_collected_values('duration_hours'),
                l.get_collected_values('duration_days_week'),
                l.get_collected_values('duration_months'),
            ])
            l.add_value('total_hours', [
                l.get_collected_values('duration_hours'),
                l.get_collected_values('duration_days_week'),
            ])
            # l.add_value('corequisites', dates_data)
            yield l.load_item()