def second_requests(self, detail_urls):
    for detail_url in detail_urls:
        response = Req(detail_url).get_select()
        selector = etree.HTML(response.content)
        item = dict()
        item['link'] = detail_url
        # The site serves several page templates; fall back from one set of
        # absolute XPaths to the next until a title is found.
        item['title'] = xpath_out(selector.xpath(
            '/html/body/div[2]/div[2]/div[1]/div[1]/h1/text()'))
        if item['title']:
            item['datetime'] = xpath_out(selector.xpath(
                '/html/body/div[2]/div[2]/div[1]/div[1]/div[1]/span[1]/text()'))
        else:
            item['title'] = xpath_out(selector.xpath(
                '/html/body/div[2]/div[3]/div[1]/div[2]/h1/text()'))
            if item['title']:
                item['datetime'] = xpath_out(selector.xpath(
                    '/html/body/div[2]/div[3]/div[1]/div[2]/div[1]/span[1]/text()'))
            else:
                item['title'] = xpath_out(selector.xpath(
                    '/html/body/div[4]/div/div[2]/div[1]/h1/text()'))
                item['datetime'] = xpath_out(selector.xpath(
                    '//*[@id="time"]/text()'))
        yield item
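# All of these snippets rely on an `xpath_out` helper defined elsewhere in the
# project. A minimal sketch of the behaviour the call sites assume (first
# XPath result, or None when nothing matched); not necessarily the project's
# exact implementation:
def xpath_out(results):
    """Return the first element of an XPath result list, or None."""
    return results[0] if results else None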
def first_requests(self):
    response = Req(self.url, proxy=True).get_select()
    selector = etree.HTML(response.text)
    seen = []  # renamed from `set`, which shadowed the builtin
    partA = selector.xpath(
        '//div[@class="column text-align-left visible-desktop visible-mobile last-column"]/div[@class="column-tout "]')
    partB = selector.xpath('//div[@class="column large-headline"]/div[@class="column-tout "]')
    partC = selector.xpath('//div[@class="column column-feed"]/div[@class="column-tout "]')
    # The three sections share the same tout markup, so one loop covers all.
    for part in partA + partB + partC:
        item = dict()
        item['title'] = xpath_out(part.xpath('div[1]/a/text()')).strip()
        item['link'] = "http://www.fortune.com/" + xpath_out(part.xpath('div[1]/a/@href'))
        if item['link'] not in seen:  # deduplicate across sections
            seen.append(item['link'])
            yield item
def first_requests(self):
    response = Req(self.url, proxy=True).get_select()
    selector = etree.HTML(response.text)
    partA = selector.xpath(
        '//div[@class="column text-align-left visible-desktop visible-mobile last-column"]/div[@class="column-tout "]')
    partB = selector.xpath(
        '//div[@class="column text-align-left visible-desktop"]/div[@class="column-tout "]')
    for part in partA:
        item = dict()
        item['title'] = xpath_out(
            part.xpath('div[@class="column-tout-info "]/div/div/a/text()')).strip()
        item['link'] = "http://time.com" + xpath_out(
            part.xpath('div[@class="column-tout-info "]/div/div/a/@href'))
        yield item
    for part in partB:
        item = dict()
        item['title'] = xpath_out(
            part.xpath('div[@class="column-tout-info "]/div/div[1]/a/text()')).strip()
        item['link'] = "http://time.com" + xpath_out(
            part.xpath('div[@class="column-tout-info "]/div/div[1]/a/@href'))
        yield item
def first_requests(self):
    selector = etree.HTML(Req(self.url, proxy=True).get_select().text)
    partA = selector.xpath(
        '//div[@class="module__content"]/ul[@class="media-list"]/li')
    partB = selector.xpath(
        '//div[@class="module__content"]/ul[@class="media-list media-list--fixed-height"]/li')
    # Both lists use the same media markup, so one loop handles them.
    for part in partA + partB:
        item = dict()
        item['title'] = xpath_out(
            part.xpath('div/div[@class="media__content"]/h3/a/text()'))
        if item['title'] is not None:
            item['title'] = item['title'].strip()
        item['link'] = xpath_out(
            part.xpath('div/div[@class="media__content"]/h3/a/@href'))
        yield item
def third_requests(self, url):
    # Tencent's async endpoint that serves the article body for JS-rendered pages
    content_api = 'https://openapi.inews.qq.com/getQQNewsNormalContent'
    pattern_id = re.compile(r'.*?/([\s\w]*)$', re.S)
    headers = {
        'referer': url,
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3610.2 Safari/537.36'
    }
    params = {
        'id': re.match(pattern_id, url).group(1),  # article id is the last path segment
        'chlid': 'news_rss',
        'refer': 'mobilewwwqqcom',
        'otype': 'jsonp',
        'ext_data': 'all',
        'srcfrom': 'newsapp',
        'callback': 'getNewsContentOnlyOutput'
    }
    response = Req(url=content_api, headers=headers, params=params).get_select()
    # Decode the \uXXXX escapes in the JSONP body by evaluating it as a
    # Python string literal; fragile if the body ever contains a quote.
    data = eval("'" + response.content.decode('ascii') + "'")
    pattern_item = re.compile(r'.*?"title":"(.*?)",.*?"pubtime":"(.*?)",.*?$', re.S)
    info = re.match(pattern_item, data).group(1, 2)
    item = dict()
    item['link'] = url
    item['title'] = info[0]
    item['datetime'] = info[1]
    if item['title'] is not None:
        yield item
    else:
        Logger().setLogger(tc.log_path, 2,
                           "Get B2 class detail page info failed, title is None")
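# A safer alternative to the eval() trick above, assuming the endpoint wraps
# valid JSON in a `getNewsContentOnlyOutput(...)` callback. `parse_jsonp` is a
# hypothetical helper, not part of the project:
import json

def parse_jsonp(raw, callback='getNewsContentOnlyOutput'):
    text = raw.decode('utf8')
    body = text[len(callback) + 1:text.rfind(')')]  # strip `callback(` and the closing `)`
    data = json.loads(body)  # json handles the \uXXXX escapes itself
    return data.get('title'), data.get('pubtime')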
def first_requests(self):
    response = Req(self.url).get_select()
    selector = etree.HTML(response.text)
    partA = selector.cssselect(
        'body > div.area.areabg1 > div:nth-child(2) > div > div.tabContents.active > table > tr')
    partB = selector.cssselect(
        'body > div.area.areabg1 > div:nth-child(6) > div > div:nth-child(3) > table > tr')
    for rows in (partA, partB):
        rows.pop(0)  # drop the table header row
        for part in rows:
            item = dict()
            item['link'] = part.xpath('td[1]/a/@href')[0]
            item['title'] = part.xpath('td[1]/a/text()')[0]
            item['hot'] = part.xpath('td[2]/text()')[0]
            item['hot'] = round(int(item['hot']) / 10000, 2)
            yield item
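# Unit note for the `hot` field computed above (and in similar snippets below):
# the raw count is scaled to units of 10,000 (万), e.g.
# round(1234567 / 10000, 2) == 123.46.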
def first_requests(self):
    response = Req(self.url).get_select()
    response.encoding = 'utf-8'  # Important: force UTF-8 before reading .text
    selector = etree.HTML(response.text)
    hrefs = selector.xpath('//*[@id="focusListNews"]//a')
    url_set = []
    for href in hrefs:
        item = dict()
        item['title'] = xpath_out(href.xpath('text()'))
        item['link'] = xpath_out(href.xpath('@href'))
        if item['link'] in url_set:
            continue  # deduplicate on the URL; comparing the <a> elements themselves never matches
        url_set.append(item['link'])
        if item['title'] is not None and re.match(self.re_title, item['title']):
            yield item
def first_requests(self):
    selector = etree.HTML(Req(self.url).get_select().text)
    sections = selector.xpath('//*[@id="tab-news-01"]/ul/li/a')
    for section in sections:
        item = dict()
        item['title'] = xpath_out(section.xpath('text()'))
        item['link'] = xpath_out(section.xpath('@href'))
        if item['title'] is not None:
            yield item
def first_requests(self):
    response = Req(self.url).get_select()
    topics = json.loads(
        response.content.decode('utf8'))['data']['bang_topic']['topic_list']
    for topic in topics:
        item = dict()
        item['title'] = topic['topic_name']
        item['link'] = topic['topic_url'].replace("&amp;", "&")  # unescape HTML entities in the URL
        item['hot'] = round(int(topic['discuss_num']) / 10000, 2)
        yield item
def second_requests(self):  # second pass: request the detail pages
    for url in self.A_urls:  # class-A URLs can be fetched directly
        try:
            selector = etree.HTML(Req(url).get_select().content)
            item = dict()
            item['link'] = url
            item['title'] = selector.xpath(
                '//*[@id="Main-Article-QQ"]/div/div[1]/div[1]/div[1]/h1/text()')
            item['datetime'] = selector.xpath(
                '//*[@id="Main-Article-QQ"]/div/div[1]/div[1]/div[1]/div/div[1]/span[3]/text()')
            if item['title'] != [] and item['datetime'] != []:
                item['title'] = item['title'][0]
                item['datetime'] = item['datetime'][0]
            elif item['title'] != [] and item['datetime'] == []:
                item['title'] = item['title'][0]
                item['datetime'] = selector.xpath(
                    '//*[@id="Main-Article-QQ"]/div/div[1]/div[1]/div[1]/div/div[1]/span[2]/text()')[0]
            else:
                item['title'] = selector.xpath(
                    '//*[@id="Main-Article-QQ"]/div[2]/div[1]/div[2]/div[1]/h1/text()')[0]
                item['datetime'] = selector.xpath(
                    '//*[@id="Main-Article-QQ"]/div[2]/div[1]/div[2]/div[1]/div/div[1]/span[3]/text()')[0]
            yield item
        except Exception:
            Logger().setLogger(tc.log_path, 2,
                               "Failed to get A class detail page info, url is " + url)
    for url in self.B_urls:  # class-B URLs are (partly) JS-rendered
        try:
            response = Req(url).get_select()
            selector = etree.HTML(response.text)
            data = selector.xpath('/html/head/script[5]/text()')
            if data:  # partly JS-rendered: metadata is embedded in a script tag
                item = dict()
                # drop the leading JS assignment (e.g. 'window.DATA = ') before the JSON
                data = json.loads(data[0].strip()[14:])
                item['link'] = url
                item['title'] = data['title']
                item['datetime'] = data['pubtime']
                yield item
            else:  # fully JS-rendered: fall back to the content API
                # third_requests is a generator, so its items must be re-yielded
                yield from self.third_requests(url)
        except Exception:
            Logger().setLogger(tc.log_path, 2,
                               "Get B class detail page info failed, url is " + url)
    print("Second Time Requests Finished")
def second_requests(self, urls):
    for url in urls['A']:
        response = Req(url=url, proxy=True).get_select()
        if response is not None:
            selector = etree.HTML(response.content)
            item = dict()
            item['title'] = css_out(selector.cssselect('.vxp-media__body h1'))
            if item['title'] is not None:  # guard before .text, as in the B branch
                item['title'] = item['title'].text
            item['link'] = response.url
            yield item
    for url in urls['B']:
        response = Req(url=url, proxy=True).get_select()
        if response is not None:
            selector = etree.HTML(response.content)
            item = dict()
            item['title'] = css_out(selector.cssselect('.story-body__h1'))
            if item['title'] is not None:
                item['title'] = item['title'].text
            item['link'] = response.url
            yield item
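# `css_out` is assumed to mirror `xpath_out` for cssselect results: it returns
# the first matching element or None (the caller above reads `.text` from the
# element). A minimal sketch, not necessarily the project's implementation:
def css_out(results):
    return results[0] if results else None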
def first_requests(self):
    response = Req(self.url).get_select()
    if response is not None:
        selector = etree.HTML(response.text)
        parts = selector.xpath('//div[@id="newloadmore"]/div')
        for part in parts:
            item = dict()
            item['link'] = xpath_out(part.xpath('a/@href'))
            item['title'] = xpath_out(part.xpath('a/div[1]/text()'))
            yield item
def first_requests(self):
    # e.g. '20190301' for March 1, 2019
    self.params['top_time'] = str(datetime.date.today()).replace("-", "")
    response = Req(url=self.url, params=self.params).get_select()
    # strip the JSONP wrapper (10 leading, 2 trailing bytes) around the JSON payload
    data = json.loads(response.content[10:-2].decode('utf8'))['data']
    for news in data:
        item = dict()
        item['title'] = news['title']
        item['link'] = news['url']
        # drop the trailing UTC-offset suffix, then reformat the timestamp
        item['datetime'] = time.strptime(news['time'][:-6], '%a, %d %b %Y %H:%M:%S')
        item['datetime'] = time.strftime('%Y-%m-%d %H:%M:%S', item['datetime'])
        yield item
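# Worked example of the timestamp handling above, assuming the feed uses an
# RFC-2822-style time with a 6-character offset suffix such as ' +0800':
# >>> raw = 'Fri, 01 Mar 2019 08:30:00 +0800'
# >>> t = time.strptime(raw[:-6], '%a, %d %b %Y %H:%M:%S')
# >>> time.strftime('%Y-%m-%d %H:%M:%S', t)
# '2019-03-01 08:30:00'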
def first_requests(self): response = Req(self.url).get_select() selector = etree.HTML(response.text) part = selector.xpath('//div[@id="firehoselist"]/article') for article in part: item = dict() item['title'] = xpath_out(article.xpath('header/h2/span[1]/a/text()')) item['link'] = xpath_out(article.xpath('header/h2/span[1]/a/@href')) item['datetime'] = xpath_out(article.xpath('header/div[@class="details"]/span[2]/time/text()')).replace("@","") item['datetime'] = str(datetime.strptime(item['datetime'], 'on %A %B %d, %Y %I:%M%p')) item['link'] = "https://" + item['link'] yield item
def first_requests(self):
    response = Req(url=self.url, proxy=True).get_select()
    selector = etree.HTML(response.text)
    articles = selector.xpath(
        '//section[@class="layout-economist-today"]/div//div[@class="teaser__text"]')
    for article in articles:
        item = dict()
        item['title'] = xpath_out(article.xpath('h3/a/span/text()'))
        item['link'] = "https://www.economist.com" + xpath_out(article.xpath('h3/a/@href'))
        yield item
def first_requests(self):
    response = Req(self.url).get_select()
    selector = etree.HTML(response.content)
    detail_urls = []
    partA = selector.xpath(
        '/html/body/div[3]/div[2]/div/div[1]/div[2]/div[2]/div[1]/dl/dt/a/@href')
    detail_urls.extend(partA)
    partB = selector.xpath(
        '//div[@class="secNewsBlock"]/div[@class="secNewsList"]//p/a/@href')
    detail_urls.extend(partB)
    print(detail_urls)
    return detail_urls
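# Usage sketch for the two-pass spiders in this file: the URL list returned by
# first_requests() feeds the matching second_requests(). `spider` and `save`
# are hypothetical stand-ins:
# for item in spider.second_requests(spider.first_requests()):
#     save(item)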
def first_requests(self):
    response = Req(self.url_explore).get_select()
    selector = etree.HTML(response.text)
    gallery = selector.xpath('//*[@id="gallery_main_frame"]/div[@class="item"]')
    for g in gallery:
        item = dict()
        item['link'] = xpath_out(
            g.xpath('div[@class="bd"]/div/div[@class="title"]/a/@href'))
        item['title'] = xpath_out(
            g.xpath('div[@class="bd"]/div/div[@class="title"]/a/text()'))
        # slice the image URL out of the inline background-image style
        item['image'] = str(xpath_out(
            g.xpath('div[@class="bd"]/div[@class="pic"]/a/@style')))[21:-1]
        item['intro'] = xpath_out(g.xpath('div[@class="bd"]/div/p/a/text()'))
        if item['title']:
            yield item
    home = Req(self.url_home).get_select()
    selector = etree.HTML(home.text)
    side = selector.xpath('//*[@id="anony-sns"]/div/div[2]/div[2]/ul/div/ul/li')
    for s in side:
        item = dict()
        item['link'] = xpath_out(s.xpath('a/@href'))
        item['title'] = xpath_out(s.xpath('a/text()'))
        item['hot'] = xpath_out(s.xpath('span/text()'))
        item['hot'] = re.match(self.hot_pattern, str(item['hot']))
        if item['hot'] is not None:
            item['hot'] = item['hot'].group(1)
        if item['title']:
            yield item
def first_requests(self):
    response = Req(self.url).get_select()
    selector = etree.HTML(response.content)
    detail_urls = []
    detail_urls.extend(selector.xpath(
        '/html/body/div[2]/div[5]/div/div[1]/div[2]/div[2]/dl//a/@href'))
    detail_urls.extend(selector.xpath(
        '/html/body/div[2]/div[5]/div/div[1]/div[3]/div[3]/div//a/@href'))
    return detail_urls
def second_requests(self, detail_urls):
    for detail_url in detail_urls:
        response = Req(detail_url).get_select()
        selector = etree.HTML(response.text)
        item = dict()
        item['link'] = detail_url
        item['title'] = xpath_out(selector.xpath(
            '//*[@id="article-container"]/div[2]/div[1]/div[1]/div[1]/h1/text()'))
        item['datetime'] = xpath_out(selector.xpath('//*[@id="news-time"]/text()'))
        if item['title'] is not None:
            yield item
def first_requests(self): response = Req(self.url).get_select() selector = etree.HTML(response.text) part = selector.xpath( '//div[@class="archive-listing-component"]/div[1]/ul/li') for article in part: item = dict() item['link'] = "https://www.wired.com" + xpath_out( article.xpath( 'div[@class="archive-item-component__info"]/a/@href') ).encode('utf8').decode('utf8') item['title'] = xpath_out( article.xpath( 'div[@class="archive-item-component__info"]/a/h2/text()')) yield item
def first_requests(self):
    response = Req(self.url).get_select()
    selector = etree.HTML(response.content)
    hrefs = selector.xpath('//*[@id="hpart2L"]//a')
    url_set = []
    for href in hrefs:
        item = dict()
        item['title'] = xpath_out(href.xpath('text()'))
        item['link'] = xpath_out(href.xpath('@href'))
        if item['link'] in url_set:
            continue  # deduplicate on the URL; comparing the <a> elements themselves never matches
        url_set.append(item['link'])
        if item['title'] is not None and re.match(self.re_title, item['title']):
            yield item
def first_requests(self):
    selector = etree.HTML(Req(self.url).get_select().text)
    partA = selector.xpath('//div[@class="focus-news-box"]/div[@class="news"]/p')
    partB = selector.xpath('//div[@class="focus-news-box"]/div[3]/div/ul/li')
    # Both sections carry the title and link on the same <a> attributes.
    for part in partA + partB:
        item = dict()
        item['title'] = xpath_out(part.xpath('a/@title'))
        item['link'] = xpath_out(part.xpath('a/@href'))
        yield item
def first_requests(self):
    response = Req(self.url).get_select()
    if response is not None:
        selector = etree.HTML(response.text)
        partA = selector.xpath('/html/body/div[5]/div[3]/div[2]/dl')
        partB = selector.xpath('/html/body/div[9]/div[2]/div/dl')
        for part in partA + partB:
            item = dict()
            item['link'] = xpath_out(part.xpath('dd/h3/a/@href'))
            item['title'] = xpath_out(part.xpath('dd/h3/a/text()'))
            yield item
def first_requests(self): response = Req(url=self.url, proxy=True).get_select() selector = etree.HTML(response.text) articles = selector.xpath('//div[@class="teaser-list"]/article') for article in articles: item = dict() item['title'] = xpath_out( article.xpath('a/div[2]/h3/span[2]/text()')) item['link'] = "https://www.economist.com" + xpath_out( article.xpath('a/@href')) item['datetime'] = xpath_out( article.xpath( 'a/div[2]/div[@class="teaser__datetime"]/time/@datetime') ).encode('utf8').decode('utf8') item['datetime'] = str( datetime.strptime(item['datetime'], '%Y-%m-%dT%H:%M:%SZ')) yield item
def first_requests(self):
    response = Req(self.url).get_select()
    selector = etree.HTML(response.content)
    detail_urls = []
    partA = selector.xpath(
        '/html/body/div[4]/div[2]/div[1]/div/div[2]/div/div/div[1]//a/@href')
    partB = selector.xpath(
        '/html/body/div[4]/div[2]/div[1]/div/div[2]/div/div/div[2]/div//a/@href')
    detail_urls.extend(partA)
    for part in partB:
        if not re.match(self.pattern_img, part):
            detail_urls.append(part)
    return detail_urls
def first_requests(self): response = Req(self.url, proxy=True).get_select() selector = etree.HTML(response.text) partA = selector.xpath('//div[@class="css-1h4m9oq"]/article') print(partA) partB = selector.xpath('//div[@class="css-1h4m9oq"]/ul/li') print(partB) for part in partA: item = dict() item['title'] = xpath_out(part.xpath('a[2]/div/h2/text()')) item['link'] = xpath_out(part.xpath('a[2]/@href')) yield item for part in partB: item = dict() item['title'] = xpath_out(part.xpath('a/div/h1/text()')) item['link'] = xpath_out(part.xpath('a/@href')) yield item
def first_requests(self):
    response = Req(self.url, proxy=True).get_select()
    selector = etree.HTML(response.text)
    partA = selector.xpath(
        '//main[@id="site-content"]/div/div[2]/div[2]/div[1]/div/article')
    partB = selector.xpath(
        '//main[@id="site-content"]/div/div[2]/div[2]/div[1]/div/ul/li')
    for part in partA:
        item = dict()
        item['title'] = xpath_out(part.xpath('a[2]/div/h2/text()'))
        item['link'] = xpath_out(part.xpath('a[2]/@href'))
        yield item
    for part in partB:
        item = dict()
        item['title'] = xpath_out(part.xpath('a/div/h1/text()'))
        item['link'] = xpath_out(part.xpath('a/@href'))
        yield item
def first_requests(self):  # first pass: request the front page to collect detail-page URLs
    selector = etree.HTML(Req(self.url).get_select().content)
    try:
        hrefs = selector.xpath('//*[@id="tab-news-01"]/ul/li/a/@href')
        for href in hrefs:  # classify every URL as class A or class B
            if re.match(self.pattern_a, href):
                self.A_urls.append(href)
            elif re.match(self.pattern_b, href):
                self.B_urls.append(href)
        print("First Time Requests Succeed")
    except Exception:
        Logger().setLogger(tc.log_path, 4, "Failed to get detail_page_urls")
def first_requests(self):
    for url in self.urls:
        response = Req(url, proxy=True).get_select()
        selector = etree.HTML(response.text)
        partA = selector.xpath('//div[@class="partial hero"]/article')
        partB = selector.xpath('//div[@class="partial marquee"]/article')
        # The hero and marquee articles share the same inner markup.
        for part in partA + partB:
            item = dict()
            item['title'] = xpath_out(part.xpath('div/h3/a/text()')).strip()
            item['link'] = "https://time.com" + xpath_out(part.xpath('div/h3/a/@href'))
            yield item
def first_requests(self):
    response = Req(url=self.url, cookies=self.cookies).get_select()
    selector = etree.HTML(response.text)
    sections = selector.xpath('//*[@class="HotList-list"]//section')
    for section in sections:
        item = dict()
        item['title'] = xpath_out(section.xpath('div[2]/a/h2/text()'))
        item['link'] = xpath_out(section.xpath('div[2]/a/@href'))
        # strip the trailing unit suffix before converting the heat value
        item['hot'] = float(xpath_out(section.xpath('div[2]/div/text()'))[:-3])
        if item['title'] is not None:
            if item['hot'] <= 150:
                item['home'] = False
            yield item
        else:
            Logger().setLogger(zh.log_path, 2,
                               "Item's title is None, item is " + str(item))
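# Every snippet above goes through a `Req` wrapper defined elsewhere in the
# project. A minimal sketch under the assumptions visible at the call sites
# (keyword arguments url/headers/params/cookies plus a `proxy` flag, and a
# get_select() method that returns a requests.Response or None on failure).
# The proxy address is a hypothetical placeholder:
import requests

class Req:
    def __init__(self, url, headers=None, params=None, cookies=None, proxy=False):
        self.url = url
        self.headers = headers
        self.params = params
        self.cookies = cookies
        self.proxies = {'http': 'http://127.0.0.1:1080',
                        'https': 'http://127.0.0.1:1080'} if proxy else None

    def get_select(self):
        try:
            return requests.get(self.url, headers=self.headers, params=self.params,
                                cookies=self.cookies, proxies=self.proxies, timeout=10)
        except requests.RequestException:
            return None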