# Shared imports assumed by the parse_detail variants below; the original
# snippets omit their module headers. CrawlerAffairItem and the process_*
# helpers come from the project's own items/utils modules (import paths
# not shown in the source).
import time

from scrapy.selector import Selector
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def parse_detail(self, response):
    sel = Selector(response)
    news_item = CrawlerAffairItem()
    spider_time = str(int(time.time()))
    publish_time = sel.xpath('//*[@id="pubtime_baidu"]/text()').extract()
    if len(publish_time) > 0:
        publish_time = publish_time[0]
    else:
        publish_time = None
    title = sel.xpath('//h1[@class="articleTitle"]/text()').extract()
    contents = sel.xpath('//*[@id="articleBody"]/p/text()').extract()
    labels = sel.xpath('//*[@id="articleKeywords"]/a/text()').extract()
    news_item["spider_time"] = spider_time
    news_item["publish_time"] = process_time(publish_time)
    news_item["title"] = process_title(title)
    news_item["label"] = process_label(labels)
    news_item["content"] = process_content(contents)
    # Selector has no reliable .response attribute; take the URL from the
    # response object directly (the original `sel.response.url` breaks on
    # parsel-based Selectors).
    news_item["url"] = response.url.strip()
    return news_item
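# Usage note: on Scrapy >= 1.0 the explicit Selector(response) above is
# optional; Response exposes the same selector API directly, and .get() /
# .getall() are the modern spellings of .extract_first() / .extract(), e.g.
#     publish_time = response.xpath('//*[@id="pubtime_baidu"]/text()').get()
# The Selector-based form is kept below to stay close to the original code.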
def parse_detail(self, response):
    news_item = CrawlerAffairItem()
    spider_time = str(int(time.time()))
    self.detail_browser.get(response.url)
    # An explicit wait was tried here but left disabled (see the sketch
    # after this function); a fixed sleep is used instead.
    # wait = WebDriverWait(self.detail_browser, 1)
    # wait.until(EC.element_to_be_clickable(
    #     (By.XPATH, '/html/head/meta[contains(@name, "apub:time")]')))
    time.sleep(0.5)
    publish_time_element = self.detail_browser.find_element_by_xpath(
        '/html/head/meta[contains(@name, "apub:time")]')
    publish_time = publish_time_element.get_attribute("content")
    title_elements = self.detail_browser.find_elements_by_xpath(
        '//div[@class="qq_conent clearfix"]/div[@class="LEFT"]/h1')
    title = [t.text for t in title_elements]
    contents_element = self.detail_browser.find_elements_by_xpath(
        '//div[@class="content-article"]/p')
    contents = [content.text for content in contents_element]
    labels = []
    news_item["spider_time"] = spider_time
    news_item["publish_time"] = process_time(publish_time)
    news_item["title"] = process_title(title)
    news_item["content"] = process_content(contents)
    news_item["label"] = process_label(labels)
    news_item["url"] = response.url.strip()
    return news_item
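# A minimal sketch of replacing the fixed sleep above with an explicit wait.
# EC.element_to_be_clickable requires a *visible* element, which a <meta>
# tag in <head> never is -- likely why the commented-out wait was abandoned.
# EC.presence_of_element_located only checks that the node exists in the
# DOM, so it works for invisible elements. The helper name wait_for_meta
# and the 5-second timeout are illustrative assumptions, not part of the
# original spider.
from selenium.common.exceptions import TimeoutException


def wait_for_meta(browser, timeout=5):
    """Block until the apub:time <meta> node is present in the DOM."""
    try:
        WebDriverWait(browser, timeout).until(
            EC.presence_of_element_located(
                (By.XPATH, '/html/head/meta[contains(@name, "apub:time")]')))
    except TimeoutException:
        # Fall back to a short fixed sleep if the node never appears.
        time.sleep(0.5)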
def parse_detail(self, response):
    sel = Selector(response)
    news_item = CrawlerAffairItem()
    spider_time = str(int(time.time()))
    publish_time = sel.xpath(
        '//div[@class="big_img"]/div[@class="more"]/text()').extract()
    if len(publish_time) > 0:
        publish_time = publish_time[0]
    else:
        publish_time = None
    title = sel.xpath('//div[@class="big_img"]/h1/text()').extract()
    contents = sel.xpath('//*[@id="content"]/p/text()').extract()
    labels = []
    news_item["spider_time"] = spider_time
    news_item["publish_time"] = process_time(publish_time)
    news_item["title"] = process_title(title)
    news_item["label"] = process_label(labels)
    news_item["content"] = process_content(contents)
    news_item["url"] = response.url.strip()
    return news_item
def parse_detail(self, response):
    sel = Selector(response)
    news_item = CrawlerAffairItem()
    spider_time = str(int(time.time()))
    publish_time = sel.xpath(
        '//div[@class="article-info"]/p/span[@class="article-timestamp ml10"]/text()'
    ).extract()
    if len(publish_time) > 0:
        publish_time = publish_time[0]
    else:
        publish_time = None
    title = sel.xpath('//div[@class="article-title"]/h1/text()').extract()
    contents = sel.xpath(
        '//div[@class="article-content"]/p/text()').extract()
    labels = sel.xpath(
        '//div[@class="fl ml10 article-tags"]/a/text()').extract()
    news_item["spider_time"] = spider_time
    news_item["publish_time"] = process_time(publish_time)
    news_item["title"] = process_title(title)
    news_item["label"] = process_label(labels)
    news_item["content"] = process_content(contents)
    news_item["url"] = response.url.strip()
    return news_item
def parse_detail(self, response):
    news_item = CrawlerAffairItem()
    spider_time = str(int(time.time()))
    self.sub_browser.get(response.url)
    time.sleep(3)
    try:
        publish_time = self.sub_browser.find_element_by_xpath(
            '//div[@class="tit"]/h2/b').text
        title_elements = self.sub_browser.find_elements_by_xpath(
            '//div[@class="tit"]/h3')
        title = [t.text for t in title_elements]
        contents_element = self.sub_browser.find_elements_by_xpath(
            '//div[@class="viewcontent"]')
        contents = [content.text for content in contents_element]
        labels = []
    except Exception:
        # A bare raise is the idiomatic re-raise; the original `raise e`
        # did the same thing more verbosely.
        raise
    news_item["spider_time"] = spider_time
    news_item["publish_time"] = process_time(publish_time)
    news_item["title"] = process_title(title)
    news_item["content"] = process_content(contents)
    news_item["label"] = process_label(labels)
    news_item["url"] = response.url.strip()
    return news_item
def parse_detail(self, response):
    sel = Selector(response)
    news_item = CrawlerAffairItem()
    spider_time = str(int(time.time()))
    # '/html/body/div[2]/div[3]/div/div[1]'
    publish_time = sel.xpath(
        '//div[@class="newscontent"]/div[@class="news_about"]/p/text()'
    ).extract()
    title = sel.xpath('//div[@class="newscontent"]/h1/text()').extract()
    contents = sel.xpath('//div[@class="news_txt"]/text()').extract()
    labels = []
    # The keyword line appears to hold "... >> a,b,c": keep the part after
    # ">>" and split on commas (one query instead of the original two).
    keyword_text = sel.xpath(
        '//div[@class="news_keyword"]/text()').extract_first()
    if keyword_text:
        labels = keyword_text.split('>>')[-1].split(',')
    news_item["spider_time"] = spider_time
    news_item["publish_time"] = process_time(publish_time)
    news_item["title"] = process_title(title)
    news_item["label"] = process_label(labels)
    news_item["content"] = process_content(contents)
    news_item["url"] = response.url.strip()
    return news_item
def parse_detail(self, response):
    news_item = CrawlerAffairItem()
    spider_time = str(int(time.time()))
    # '/html/body/div[2]/div[3]/div/div[1]'
    self.sub_browser.get(response.url)
    # wait = WebDriverWait(self.browser, 1)
    # wait.until(EC.element_to_be_clickable(
    #     (By.XPATH, '//div[@class="inner-content"]/div[@class="show_title"]')))
    time.sleep(0.5)
    publish_time_element = self.sub_browser.find_elements_by_xpath(
        '//div[@class="inner-content"]/div[@class="show_time"]/div/div[2]')
    # Avoid naming the loop variable `time`: it shadows the time module
    # inside the comprehension and reads ambiguously.
    publish_time = [ts.text for ts in publish_time_element]
    title_element = self.sub_browser.find_elements_by_xpath(
        '//div[@class="inner-content"]/div[@class="show_title"]')
    title = [t.text for t in title_element]
    contents_element = self.sub_browser.find_elements_by_xpath(
        '//div[@class="inner-content"]/div[@class="show_content"]/p')
    contents = [c.text for c in contents_element]
    labels = []
    news_item["spider_time"] = spider_time
    news_item["publish_time"] = process_time(publish_time)
    news_item["title"] = process_title(title)
    news_item["label"] = process_label(labels)
    news_item["content"] = process_content(contents)
    # The original built an unused Selector(response) only to read
    # sel.response.url; the URL comes straight from the response.
    news_item["url"] = response.url.strip()
    return news_item
def parse_detail(self, response):
    news_item = CrawlerAffairItem()
    spider_time = str(int(time.time()))
    self.sub_browser.get(response.url)
    wait = WebDriverWait(self.sub_browser, 1)
    wait.until(
        EC.element_to_be_clickable(
            (By.XPATH, '//div[@id="text_block"]/div[@id="content_detail"]')))
    publish_time = self.sub_browser.find_element_by_xpath(
        '//div[@id="text_block"]/div[@id="content_detail"]').text
    title_elements = self.sub_browser.find_elements_by_xpath(
        '//div[@id="text_block"]/div[@class="title_bar"]')
    title = [t.text for t in title_elements]
    # The body text can live under several layouts; probe each candidate
    # XPath in turn and collect whatever matches (same XPaths and order as
    # the original six copy-pasted blocks).
    content_xpaths = [
        '//*[@id="content_detail"]/p',
        '//*[@id="zoom"]/p',
        '//*[@id="content_detail"]/table/tbody/tr/td/p',
        '//*[@id="content_detail"]/font',
        '//*[@id="content_detail"]',
        '//*[@id="content_detail"]/table/tbody/tr/td[@class="detail"]/p',
    ]
    contents = []
    for xpath in content_xpaths:
        elements = self.sub_browser.find_elements_by_xpath(xpath)
        contents.extend(element.text for element in elements)
    labels = []
    news_item["spider_time"] = spider_time
    news_item["publish_time"] = process_time(publish_time)
    news_item["title"] = process_title(title)
    news_item["label"] = process_label(labels)
    news_item["content"] = process_content(contents)
    news_item["url"] = response.url.strip()
    return news_item
def parse_detail(self, response):
    news_item = CrawlerAffairItem()
    spider_time = str(int(time.time()))
    self.detail_browser.get(response.url)
    wait = WebDriverWait(self.detail_browser, 1)
    wait.until(
        EC.element_to_be_clickable((
            By.XPATH,
            '//div[@id="top_bar"]/div/div[@class="date-source"]/span[@class="date"]'
        )))
    # The site serves (at least) two templates; probe both for the date ...
    time_xpaths = [
        '//div[@id="top_bar"]/div/div[@class="date-source"]/span[@class="date"]',
        '//div[@class="page-info"]/span[@class="time-source"]',
    ]
    publish_time_parts = []
    for xpath in time_xpaths:
        elements = self.detail_browser.find_elements_by_xpath(xpath)
        publish_time_parts.extend(element.text for element in elements)
    publish_time = ''.join(publish_time_parts)
    # ... both for the headline ...
    title_xpaths = [
        '//h1[@class="main-title"]',
        '//div[@class="page-header"]/h1',
    ]
    title = []
    for xpath in title_xpaths:
        elements = self.detail_browser.find_elements_by_xpath(xpath)
        title.extend(t.text for t in elements)
    # ... and all known containers for the article body.
    content_xpaths = [
        '//div[@id="artibody"]/p',
        '//div[@id="artibody"]/div[@class="detail_txt"]',
        '//div[@id="article"]/p',
    ]
    contents = []
    for xpath in content_xpaths:
        elements = self.detail_browser.find_elements_by_xpath(xpath)
        contents.extend(c.text for c in elements)
    labels = []
    news_item["spider_time"] = spider_time
    news_item["publish_time"] = process_time(publish_time)
    news_item["title"] = process_title(title)
    news_item["content"] = process_content(contents)
    news_item["label"] = process_label(labels)
    news_item["url"] = response.url.strip()
    return news_item
def parse_detail(self, response):
    sel = Selector(response)
    news_item = CrawlerAffairItem()
    spider_time = str(int(time.time()))
    publish_time = sel.xpath(
        '//div[@class="box01"]/div[@class="fl"]/text()').extract_first()
    title = sel.xpath(
        '//div[@class="clearfix w1000_320 text_title"]/h1/text()').extract()
    contents = sel.xpath('//*[@id="rwb_zw"]/p/text()').extract()
    labels = []
    news_item["spider_time"] = spider_time
    news_item["publish_time"] = process_time(publish_time)
    news_item["title"] = process_title(title)
    news_item["label"] = process_label(labels)
    news_item["content"] = process_content(contents)
    news_item["url"] = response.url.strip()
    return news_item
def parse_detail(self, response):
    sel = Selector(response)
    news_item = CrawlerAffairItem()
    spider_time = str(int(time.time()))
    # '/html/body/div[2]/div[3]/div/div[1]'
    publish_time = sel.xpath(
        '//div[@class="function"]/span[@class="info"]/i/text()').extract()
    title = sel.xpath('//div[@class="cnt_bd"]/h1/text()').extract()
    contents = sel.xpath('//div[@class="cnt_bd"]/p/text()').extract()
    labels = []
    news_item["spider_time"] = spider_time
    news_item["publish_time"] = process_time(publish_time)
    news_item["title"] = process_title(title)
    news_item["label"] = process_label(labels)
    news_item["content"] = process_content(contents)
    news_item["url"] = response.url.strip()
    return news_item
def parse_detail(self, response):
    sel = Selector(response)
    news_item = CrawlerAffairItem()
    spider_time = str(int(time.time()))
    # '/html/body/div[2]/div[3]/div/div[1]'
    publish_time = sel.xpath(
        '//div[@class="title_area"]/div[@class="info1"]/text()'
    ).extract_first()
    title = sel.xpath('//div[@class="title_area"]/h1/text()').extract()
    contents = sel.xpath('//div[@class="content_area"]/p/text()').extract()
    labels = sel.xpath('//ul[@id="searchkeywords"]/li/a/text()').extract()
    news_item["spider_time"] = spider_time
    news_item["publish_time"] = process_time(publish_time)
    news_item["title"] = process_title(title)
    news_item["label"] = process_label(labels)
    news_item["content"] = process_content(contents)
    news_item["url"] = response.url.strip()
    return news_item
def parse_detail(self, response):
    sel = Selector(response)
    news_item = CrawlerAffairItem()
    spider_time = str(int(time.time()))
    publish_time = sel.xpath(
        '//div[@class="pages-date"]/text()').extract_first()
    # The title sits in one of two containers depending on the template;
    # prefer the article header, fall back to the pages title.
    title = []
    title_0 = sel.xpath(
        '//div[@class="article oneColumn pub_border"]/h1/text()').extract()
    title_1 = sel.xpath('//div[@class="pages-title"]/text()').extract()
    if len(title_0) > 0:
        title = title_0
    elif len(title_1) > 0:
        title = title_1
    contents = sel.xpath('//div[@class="pages_content"]/p/text()').extract()
    contents_1 = sel.xpath(
        '//*[@id="UCAP-CONTENT"]/p/span/span/text()').extract()
    contents.extend(contents_1)
    contents_2 = sel.xpath(
        '//div[@class="pages_content"]/p/span/text()').extract()
    contents.extend(contents_2)
    labels = []
    news_item["spider_time"] = spider_time
    news_item["publish_time"] = process_time(publish_time)
    news_item["title"] = process_title(title)
    news_item["label"] = process_label(labels)
    news_item["content"] = process_content(contents)
    news_item["url"] = response.url.strip()
    return news_item
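# The snippets above all rely on CrawlerAffairItem and four process_*
# normalizers defined elsewhere in the project. Below is a minimal sketch of
# what they plausibly look like, inferred only from how they are called here:
# the field names match the assignments above, but every cleanup rule is an
# assumption, not the project's actual code.
import scrapy


class CrawlerAffairItem(scrapy.Item):
    spider_time = scrapy.Field()
    publish_time = scrapy.Field()
    title = scrapy.Field()
    label = scrapy.Field()
    content = scrapy.Field()
    url = scrapy.Field()


def _as_text(value):
    # Callers pass a str, a list of str, or None; normalize to one string.
    if value is None:
        return ''
    if isinstance(value, list):
        return ' '.join(v.strip() for v in value if v and v.strip())
    return value.strip()


def process_time(publish_time):
    return _as_text(publish_time)


def process_title(title):
    return _as_text(title)


def process_content(contents):
    return _as_text(contents)


def process_label(labels):
    # Labels arrive as a (possibly empty) list of keyword strings.
    return [label.strip() for label in labels if label and label.strip()]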