def parse_page(self, response):
    """Parse a JSON news-list API response from m.hqew.com.

    Builds one EleIndustryItem per JSON entry (title, abstract, tags,
    date, image) and yields a detail-page request for each, carrying the
    partially filled item in meta['meta_1'].
    """
    raw = response.body_as_unicode()
    # BUG FIX: the original compared the response *string* against [],
    # which is always True, so the guard never did anything (and an empty
    # body would crash json.loads).  Guard on the raw text instead.
    if not raw:
        return
    data = json.loads(raw)
    items = []
    # NOTE: the original reused loop variable `i` for both the entry loop
    # and the inner tag loop; distinct names remove the shadowing.
    for entry in data:
        item = EleIndustryItem()
        item_id = entry['Itemid']  # renamed from `id` (shadowed the builtin)
        url1 = urljoin('https://m.hqew.com', '/news/' + str(item_id))
        # Keep only non-null tag names; comma-joined for the Keywords field.
        tags = [t['TagName'] for t in entry['TagList']
                if t['TagName'] is not None]
        item['Abstract'] = entry['Content']
        item['News_Title'] = entry['Subject']
        # Date part comes from TimeDes; time-of-day from the shared helper.
        item['News_Dt'] = (re.findall(r'\d+-\d+-\d+', entry['TimeDes'])[0] +
                           ' ' + Get_Time())
        item['URL'] = url1
        item['Keywords'] = ','.join(tags)
        item['Image_URL'] = entry['ImageUrl']
        items.append(item)
    for item in items:
        yield scrapy.Request(item['URL'], callback=self.parse_detail,
                             meta={'meta_1': item})
def parse(self, response):
    """Parse an HTML news-list page (div.cp2liststy rows).

    Fills title / URL / date / abstract per row and yields one
    detail-page request per item (dont_filter since URLs may repeat
    across list pages).
    """
    nodes = response.xpath('//div[@class="cp2liststy"]')
    if not nodes:
        return
    items = []
    for node in nodes:
        item = EleIndustryItem()
        item['News_Title'] = node.xpath('./h4/a/@title').extract_first()
        item['URL'] = node.xpath('./h4/a/@href').extract_first()
        raw_time = node.xpath('./em/text()').extract_first()
        # Keep only the yyyy-mm-dd part, then append the shared clock time.
        item['News_Dt'] = (re.findall(r'\d+-\d+-\d+', raw_time)[0] +
                           ' ' + Get_Time())
        item['Abstract'] = node.xpath('./div[2]/p/text()').extract_first()
        # Dead code removed: the original extracted per-node keyword links
        # but the join was commented out and nothing was stored on the item.
        items.append(item)
    for item in items:
        yield scrapy.Request(item['URL'], callback=self.parse_detail,
                             meta={'meta_1': item}, dont_filter=True)
def parse_detail(self, response):
    """Parse an ic37.com article page into an EleIndustryItem (Web_Id 5-37).

    Reads title, byline date/author and keyword link from the page, then
    rebuilds the HTML fragment the pipeline expects and yields the item.
    """
    content = response.xpath('//div[@class="contentlist"]')
    if content != []:
        item = EleIndustryItem()
        content = content.extract_first()
        title = response.xpath('//h1/text()')
        if title != []:
            title = title.extract_first().strip()
        node1 = response.xpath('//div[@class="newstitle"]/span/text()')
        if node1 != []:
            node2 = node1.extract_first()
            # yyyy-mm-dd embedded in the byline text; item only yields when found.
            time_1 = re.findall(r'\d+-\d+-\d+', node2)
            if time_1 != []:
                time1 = time_1[0] + ' ' + Get_Time()
                try:
                    # Byline assumed "date,author"; the author part is optional.
                    author = node2.split(',')[1].strip()
                except Exception as E:
                    author = ''
                node3 = response.xpath(
                    '//div[@class="newstitle"]/span/a/text()')
                keywords = ''
                if node3 != []:
                    keywords = node3.extract_first()
                item['News_Title'] = title
                item['News_Dt'] = time1
                item['Author'] = author
                item['Keywords'] = keywords
                # Rebuild the fragment downstream expects:
                # <h1>title</h1> + byline div + body div.
                str_time = '<div class="explain"><span>' + item[
                    'Author'] + '</span><time>' + str(
                        item['News_Dt'].split(
                            ' ')[0]) + '</time></div>'
                content1 = '<h1>' + item[
                    'News_Title'] + '</h1>' + str_time + "<div class='content'>" + content + "</div>"
                # change_content() appears to return (cleaned_html,
                # image_url_list) -- TODO confirm against its definition.
                data = change_content(content1, 'http://www.ic37.com')
                item['Content'] = data[0]
                img_list = data[1]
                get_pic(item, img_list)
                item['Update_Tm'] = get_time_stamp()
                item['Abstract'] = ''
                item['URL'] = response.url
                item['Web_Id'] = '5-37'
                #print(item)
                yield item
def parse_detail(self, response):
    """Parse an article page (article-body layout) into an item (Web_Id 5-31).

    Extracts headline, date/author spans and tag links, rebuilds the HTML
    fragment, collects image URLs from it and yields the item.
    """
    content = response.xpath('//div[@class="article-body"]')
    if content != []:
        item = EleIndustryItem()
        content = content.extract_first()
        title = response.xpath('//h1[@class="headline"]/text()')
        if title != []:
            title = title.extract_first().strip()
            # print(title)
            node1 = response.xpath(
                '//div[@class="muted subline"]//span[@class="mr20"]/text()'
            )
            if node1 != []:
                node2 = node1.extract()
                # First span holds "date time"; keep the date, append shared clock time.
                time1 = node2[0].strip().split(' ')[0] + ' ' + Get_Time()
                author = ''
                # A second span, when present, is the author name.
                if len(node2) == 2:
                    author = node2[1].strip()
                node3 = response.xpath(
                    '//div[@class="overhide overhidden new-tags"]/a/text()'
                )
                keywords = ''
                if node3 != []:
                    keywords = ','.join(node3.extract())
                item['News_Title'] = title
                item['News_Dt'] = time1
                item['Author'] = author
                item['Keywords'] = keywords
                # Rebuild <h1>title</h1> + byline + body for the pipeline.
                str_time = '<div class="explain"><span>' + item[
                    'Author'] + '</span><time>' + str(
                        item['News_Dt'].split(' ')[0]) + '</time></div>'
                content1 = '<h1>' + item[
                    'News_Title'] + '</h1>' + str_time + "<div class='content'>" + content + "</div>"
                item['Content'] = content1
                # Collect every <img src> from the rebuilt fragment.
                content2 = etree.HTML(item['Content'])
                img_list = content2.xpath('//img/@src')
                get_pic(item, img_list)
                item['Update_Tm'] = get_time_stamp()
                item['Abstract'] = ''
                item['URL'] = response.url
                item['Web_Id'] = '5-31'
                #print(item)
                yield item
def parse_item1(self, response):
    """Parse an article page (art_body layout) into an item (Web_Id 5-34)."""
    # print(response.url)
    # print(response.url)
    content = response.xpath('//*[@id="art_body"]')
    if content != []:
        content = content.extract_first()
        title = response.xpath('//h1')
        if title != []:
            item = EleIndustryItem()
            # string(.) flattens any nested markup inside the <h1>.
            title = title.xpath('string(.)').extract_first().strip()
            time1 = response.xpath('//span[@class="time"]')
            if time1 != []:
                time1 = re.findall(r'\d+-\d+-\d+', time1.extract_first())
                if time1 != []:
                    time1 = time1[0] + ' ' + Get_Time()
                    author = response.xpath(
                        '//span[@class="zuozhe"]').xpath(
                            'string(.)').extract_first()
                    # A bare "作者:" label means the author field is empty.
                    if author == '作者:':
                        author = ''
                    Keywords = response.xpath(
                        '//span[@class="mbx"]/text()').extract_first()
                    item['News_Title'] = title
                    item['News_Dt'] = time1
                    item['Author'] = author
                    item['Keywords'] = Keywords
                    # Rebuild <h1>title</h1> + byline + body for the pipeline.
                    str_time = '<div class="explain"><span>' + item[
                        'Author'] + '</span><time>' + str(
                            item['News_Dt'].split(
                                ' ')[0]) + '</time></div>'
                    content1 = '<h1>' + item[
                        'News_Title'] + '</h1>' + str_time + "<div class='content'>" + content + "</div>"
                    item['Content'] = content1
                    # Collect every <img src> from the rebuilt fragment.
                    content2 = etree.HTML(item['Content'])
                    img_list = content2.xpath('//img/@src')
                    get_pic(item, img_list)
                    item['Update_Tm'] = get_time_stamp()
                    item['Abstract'] = ''
                    item['URL'] = response.url
                    item['Web_Id'] = '5-34'
                    #print(item)
                    yield item
def parse_item(self, response):
    """Parse an articlebody-style detail page into an item (Web_Id 5-33).

    Strips a leading <center> banner from the body, reads title, byline
    date/author and the breadcrumb keyword, then rebuilds the HTML
    fragment the pipeline expects and yields the item.
    """
    content = response.xpath('//*[@id="articlebody"]')
    if not content:
        return
    item = EleIndustryItem()
    content = content.extract_first()
    # Drop the leading <center> banner block from the body, if present.
    node = re.findall(r'<center>.*?</center>', content, re.S)
    if node:
        content = content.replace(node[0], '')
    title = response.xpath('//*[@id="articledetail"]/h1/text()')
    if not title:
        return
    title = title.extract_first().strip()
    node1 = response.xpath('//*[@id="articledetail"]/p/span[2]/text()')
    if not node1:
        return
    meta_text = node1.extract_first()
    node2 = meta_text.split('\xa0\xa0')
    # BUG FIX: the original called re.findall on str(node1.extract_first)
    # -- the *bound method object*, not its result -- so the date regex
    # could never match and no item was ever yielded.  Search the
    # extracted byline text instead.
    time1 = re.findall(r'\d+-\d+-\d+', meta_text)
    if not time1:
        return
    time1 = time1[0] + ' ' + Get_Time()
    # Author appears as "作者:<name>" in the second byline segment.
    author = re.findall(r'作者:(.*)', node2[1])
    author1 = ''
    if author and author[0].strip() != 'n':  # 'n' is a site placeholder value
        author1 = author[0].strip()
    node3 = response.xpath('//*[@id="navigation"]/p/a[3]/text()')
    keywords = node3.extract_first()
    item['News_Title'] = title
    item['News_Dt'] = time1
    item['Author'] = author1
    item['Keywords'] = keywords
    # Rebuild <h1>title</h1> + byline + body for the pipeline.
    str_time = ('<div class="explain"><span>' + item['Author'] +
                '</span><time>' + str(item['News_Dt'].split(' ')[0]) +
                '</time></div>')
    content1 = ('<h1>' + item['News_Title'] + '</h1>' + str_time +
                "<div class='content'>" + content + "</div>")
    item['Content'] = content1
    # Collect every <img src> from the rebuilt fragment.
    content2 = etree.HTML(item['Content'])
    img_list = content2.xpath('//img/@src')
    get_pic(item, img_list)
    item['Update_Tm'] = get_time_stamp()
    item['Abstract'] = ''
    item['URL'] = response.url
    item['Web_Id'] = '5-33'
    yield item
def parse_item1(self, response):
    """Parse a title-box style article page into an item (Web_Id 5-26)."""
    content = response.xpath('//div[@class="content"]')
    if content != []:
        item = EleIndustryItem()
        content = content.extract_first()
        title = response.xpath('//h2/text()')
        if title != []:
            title = title.extract_first()
            time1 = response.xpath(
                '//div[@class="title-box"]/p/span[1]/text()')
            if time1 != []:
                # First span looks like "label:date"; keep the date part.
                time1 = time1.extract_first().split(
                    ':')[1] + ' ' + Get_Time()
                author = response.xpath(
                    '//div[@class="title-box"]/p/span[2]/text()')
                author1 = ''
                if author != []:
                    author1 = author.extract_first().split(':')[1]
                    # A domain name in the author slot means no real author.
                    if author1.endswith('.com'):
                        author1 = ''
                item['News_Title'] = title
                item['News_Dt'] = time1
                item['Author'] = author1
                item['Keywords'] = ''
                # Rebuild <h1>title</h1> + byline + body for the pipeline.
                str_time = '<div class="explain"><span>' + item[
                    'Author'] + '</span><time>' + str(
                        item['News_Dt'].split(' ')[0]) + '</time></div>'
                content1 = '<h1>' + item[
                    'News_Title'] + '</h1>' + str_time + "<div class='content'>" + content + "</div>"
                # change_content() appears to return (cleaned_html,
                # image_url_list) -- TODO confirm against its definition.
                data = change_content(content1, self.start_urls[0])
                item['Content'] = data[0]
                # content2 = etree.HTML(item['Content'])
                img_list = data[1]
                get_pic(item, img_list)
                item['Update_Tm'] = get_time_stamp()
                item['Abstract'] = ''
                item['URL'] = response.url
                item['Web_Id'] = '5-26'
                yield item
def parse(self, response):
    """Parse a con-details news-list page and queue detail requests.

    Fills abstract / title / date / URL / keywords per row, then yields
    one request per item carrying it in meta['meta_1'].
    """
    nodes = response.xpath('//div[@class="con-details"]')
    if not nodes:
        return
    items = []
    for node in nodes:
        item = EleIndustryItem()
        item['News_Title'] = node.xpath('.//h3/a/text()').extract_first()
        item['URL'] = node.xpath('.//h3/a/@href').extract_first()
        item['Abstract'] = node.xpath('.//p/span/text()').extract_first()
        raw_time = node.xpath('.//span[@class="fb-tl"]/text()').extract_first()
        item['News_Dt'] = (re.findall(r'\d+-\d+-\d+', raw_time)[0] +
                           ' ' + Get_Time())
        # BUG FIX: the keywords XPath was absolute ('//span...'), so every
        # row picked up the first match in the whole document; make it
        # relative to the current node like the other fields.
        item['Keywords'] = node.xpath(
            './/span[@class="fb-tl"]/a/text()').extract_first()
        items.append(item)
    for item in items:
        yield scrapy.Request(item['URL'], callback=self.parse_detail,
                             meta={'meta_1': item})
def parse_item(self, response):
    """Parse a contentDiv-style article page into an item (Web_Id 5-27)."""
    content = response.xpath('//*[@id="contentDiv"]')
    if content != []:
        content = content.extract_first()
        title = response.xpath('//h1/text()')
        if title != []:
            title = title.extract_first()
            time1 = response.xpath(
                '//div[@class="authorTimeSource"]/span[2]/text()')
            if time1 != []:
                item = EleIndustryItem()
                # Span text looks like "label:date"; keep the date part.
                time1 = time1.extract_first().split(':')[1]
                time1 = time1 + ' ' + Get_Time()
                author = response.xpath(
                    '//div[@class="authorTimeSource"]/span[3]/text()'
                ).extract_first()
                author = author.split(':')[1]
                # Keyword terms live in <em> tags under div.keyWord.
                tags = response.xpath('//div[@class="keyWord"]//em').xpath(
                    'string(.)').extract()
                key = ','.join(tags)
                item['News_Title'] = title
                item['News_Dt'] = time1
                item['Author'] = author
                item['Keywords'] = key
                # Rebuild <h1>title</h1> + byline + body for the pipeline.
                str_time = '<div class="explain"><span>' + item[
                    'Author'] + '</span><time>' + str(
                        item['News_Dt'].split(' ')[0]) + '</time></div>'
                content1 = '<h1>' + item[
                    'News_Title'] + '</h1>' + str_time + "<div class='content'>" + content + "</div>"
                # change_content() appears to return (cleaned_html,
                # image_url_list) -- TODO confirm against its definition.
                data = change_content(content1, self.start_urls[0])
                item['Content'] = data[0]
                img_list = data[1]
                get_pic(item, img_list)
                item['Update_Tm'] = get_time_stamp()
                item['Abstract'] = ''
                item['URL'] = response.url
                item['Web_Id'] = '5-27'
                yield item
def parse_page1(self, response):
    """Walk the mobile news list (ul.news-items) and queue detail pages.

    Each <li> yields one EleIndustryItem (title, abstract, tags, date,
    thumbnail) plus a request to its m.hqew.com detail URL, with the item
    passed along in meta['meta_1'].
    """
    pending = []
    for entry in response.xpath('//ul[@class="news-items"]/li'):
        headline = entry.xpath('.//h3/a/text()').extract_first()
        href = entry.xpath('.//h3/a/@href').extract_first()
        # The numeric article id embedded in the href drives the detail URL.
        article_id = re.findall(r'\d+', href)
        detail_url = urljoin('https://m.hqew.com', '/news/' + str(article_id[0]))
        summary = entry.xpath('.//p/text()').extract_first().strip()
        tag_names = entry.xpath(
            './/div[@class="news-item-tag"]/a/text()').extract()
        raw_time = entry.xpath(
            './/div[@class="news-item-time"]/text()').extract_first()
        posted = re.findall(r'\d+-\d+-\d+', raw_time)[0] + ' ' + Get_Time()
        thumbnail = entry.xpath('.//img/@src').extract_first()
        item = EleIndustryItem()
        item['Abstract'] = summary
        item['News_Title'] = headline
        item['News_Dt'] = posted
        item['URL'] = detail_url
        item['Keywords'] = ','.join(tag_names)
        item['Image_URL'] = thumbnail
        pending.append(item)
    for item in pending:
        yield scrapy.Request(item['URL'], callback=self.parse_detail,
                             meta={'meta_1': item})
def parse_item1(self, response):
    """Parse a detailcon-style article page into an item (Web_Id 5-27)."""
    content = response.xpath('//div[@class="detailcon"]')
    if content != []:
        item = EleIndustryItem()
        content = content.extract_first()
        # Note the leading space in the class attribute is intentional.
        title = response.xpath('//div[@class=" detailtitle"]/text()')
        if title != []:
            title = title.extract_first().strip()
        node = response.xpath('//div[@class="detailintro"]').xpath(
            'string(.)')
        if node != []:
            node = node.extract_first()
            # Intro line assumed "author | date ..." -- split on the pipe.
            node = node.split('|')
            author = node[0]
            time1 = re.compile(r'\d+-\d+-\d+', re.S).findall(node[1])
            if time1 != []:
                time1 = time1[0] + ' ' + Get_Time()
                item['News_Title'] = title
                item['News_Dt'] = time1
                item['Author'] = author
                item['Keywords'] = ''
                # Rebuild <h1>title</h1> + byline + body for the pipeline.
                str_time = '<div class="explain"><span>' + item[
                    'Author'] + '</span><time>' + str(
                        item['News_Dt'].split(
                            ' ')[0]) + '</time></div>'
                content1 = '<h1>' + item[
                    'News_Title'] + '</h1>' + str_time + "<div class='content'>" + content + "</div>"
                # change_content() appears to return (cleaned_html,
                # image_url_list) -- TODO confirm against its definition.
                data = change_content(content1, self.start_urls[0])
                item['Content'] = data[0]
                img_list = data[1]
                get_pic(item, img_list)
                item['Update_Tm'] = get_time_stamp()
                item['Abstract'] = ''
                item['URL'] = response.url
                item['Web_Id'] = '5-27'
                yield item
def parse_item(self, response):
    """Parse an elecfans.com article page into an item (Web_Id 5-24).

    The site serves four distinct page layouts; each branch below handles
    one of them:
      1. forum post  (div.pct)
      2. legacy article (div.article-content)
      3. author-des layout (div.author_des)
      4. modern article (div.simditor-body) -- may fall back to a member
         API call to resolve the author name.
    """
    content = response.xpath('//div[@class="simditor-body clearfix"]')
    if content == []:
        # --- Layout 1: forum post (div.pct) ---
        content1 = response.xpath('//div[@class="pct"]')
        if content1 != []:
            content1 = content1.extract_first()
            # Post time is either a span @title or the em text itself.
            time1 = response.xpath(
                '//div[@class="bar_tip float_l"]/em/span/@title')
            if time1 == []:
                time1 = response.xpath(
                    '//div[@class="bar_tip float_l"]/em/text()')
            if time1 != []:
                time1 = time1.extract_first()
                time1 = re.findall(r'(\d+-\d+-\d+)', time1)
                if time1 != []:
                    title = response.xpath(
                        '//*[@id="thread_subject"]/text()')
                    if title != []:
                        item = EleIndustryItem()
                        title = title.extract_first()
                        item['News_Title'] = title
                        time1 = time1[0]
                        time1 = time1 + ' ' + Get_Time()
                        item['News_Dt'] = time1
                        author = response.xpath(
                            '//div[@class="bar_tip float_l"]/div/span/a/span/text()'
                        )
                        author1 = ''
                        if author != []:
                            author1 = author.extract_first()
                        item['Author'] = author1
                        # Rebuild <h1>title</h1> + byline + body for the pipeline.
                        str_time = '<div class="explain"><span>' + item[
                            'Author'] + '</span><time>' + str(
                                item['News_Dt'].split(
                                    ' ')[0]) + '</time></div>'
                        content1 = '<h1>' + item[
                            'News_Title'] + '</h1>' + str_time + "<div class='content'>" + content1 + "</div>"
                        # change_content() appears to return (cleaned_html,
                        # image_url_list) -- TODO confirm against its definition.
                        data = change_content(content1, self.start_urls[0])
                        item['Content'] = data[0]
                        # content2 = etree.HTML(item['Content'])
                        img_list = data[1]
                        get_pic(item, img_list)
                        item['Update_Tm'] = get_time_stamp()
                        item['Abstract'] = ''
                        item['Keywords'] = ''
                        item['URL'] = response.url
                        item['Web_Id'] = '5-24'
                        #print(item)
                        yield item
        if content1 == []:
            # --- Layouts 2 and 3 ---
            content2 = response.xpath('//div[@class="article-content"]')
            content3 = response.xpath('//div[@class="author_des"]')
            if content2 != []:
                # Layout 2: legacy article (div.article-content).
                content2 = content2.extract_first()
                title = response.xpath('//h1/text()')
                if title != []:
                    title = title.extract_first()
                    time1 = response.xpath(
                        '//div[@class="fl"]/em[3]/text()')
                    if time1 != []:
                        item = EleIndustryItem()
                        # Normalize the Chinese date "YYYY年MM月DD日" to "YYYY-MM-DD".
                        time1 = time1.extract_first().replace(
                            '年', '-').replace('月', '-').replace('日', '')
                        time1 = time1.strip().split(' ')[0]
                        time1 = time1 + ' ' + Get_Time()
                        author = response.xpath(
                            '//div[@class="fl"]/em[1]/text()')
                        author1 = ''
                        if author != []:
                            author1 = author.extract_first()
                        tag = response.xpath(
                            '//div[@class="tag"]//span').xpath(
                                'string(.)').extract()
                        tags = ','.join(tag)
                        item['News_Title'] = title
                        item['Author'] = author1
                        item['News_Dt'] = time1
                        item['Keywords'] = tags
                        str_time = '<div class="explain"><span>' + item['Author'] + '</span><time>' + \
                            item['News_Dt'].split(' ')[0] + '</time></div>'
                        content_1 = '<h1>' + item[
                            'News_Title'] + '</h1>' + str_time + "<div class='content'>" + content2 + "</div>"
                        data = change_content(content_1, self.start_urls[0])
                        item['Content'] = data[0]
                        # content2 = etree.HTML(item['Content'])
                        img_list = data[1]
                        get_pic(item, img_list)
                        item['Update_Tm'] = get_time_stamp()
                        item['Abstract'] = ''
                        item['URL'] = response.url
                        item['Web_Id'] = '5-24'
                        #print(item)
                        yield item
            if content3 != []:
                # Layout 3: author_des layout.
                content3 = content3.extract_first()
                title = response.xpath('//h1/text()')
                if title != []:
                    title = title.extract_first()
                    span_node = response.xpath(
                        '//span[@class="float_left font-small color_gray"]'
                    ).xpath('string(.)').extract_first()
                    # Expected shape: "<author> 发表于 <date> <time>" -> 3 fields.
                    span_text = span_node.split(' ')
                    if len(span_text) == 3:
                        item = EleIndustryItem()
                        author = span_text[0].replace('\r', '').replace(
                            '\n', '').replace('\t', '').replace('发表于', '')
                        time1 = span_text[1] + ' ' + Get_Time()
                        tag = response.xpath(
                            '//ul[@class="article_tags clearfix"]/li/span'
                        ).xpath('string(.)').extract()
                        tags = ','.join(tag)
                        item['News_Title'] = title
                        item['Author'] = author
                        item['News_Dt'] = time1
                        item['Keywords'] = tags
                        str_time = '<div class="explain"><span>' + item['Author'] + '</span><time>' + \
                            item['News_Dt'].split(' ')[0] + '</time></div>'
                        content_1 = '<h1>' + item[
                            'News_Title'] + '</h1>' + str_time + "<div class='content'>" + content3 + "</div>"
                        data = change_content(content_1, self.start_urls[0])
                        item['Content'] = data[0]
                        # content2 = etree.HTML(item['Content'])
                        img_list = data[1]
                        get_pic(item, img_list)
                        item['Update_Tm'] = get_time_stamp()
                        item['Abstract'] = ''
                        item['URL'] = response.url
                        item['Web_Id'] = '5-24'
                        #print(item)
                        yield item
    if content != []:
        # --- Layout 4: modern article (div.simditor-body) ---
        content = content.extract_first()
        title = response.xpath('//h1/text()')
        if title != []:
            title = title.extract_first()
            time1 = response.xpath('//section//span[@class="time"]/text()')
            if time1 != []:
                item = EleIndustryItem()
                item['News_Title'] = title
                # Normalize the Chinese date "YYYY年MM月DD日" to "YYYY-MM-DD".
                time1 = time1.extract_first().replace('年', '-').replace(
                    '月', '-').replace('日', '')
                time1 = time1.strip().split(' ')[0]
                time1 = time1 + ' ' + Get_Time()
                item['News_Dt'] = time1
                author2 = response.xpath(
                    '//div[@class="article-info art-share-layout m-share-layout clearfix"]/a'
                ).xpath('string(.)').extract()
                # Default author when the page does not name one.
                author = '电子发烧友网'
                if author2 != []:
                    author = author2[0]
                    if author == '':
                        author = '电子发烧友网'
                if author2 == ['']:
                    # Author link was empty: resolve the author via the
                    # member API using the hidden webMID input.  Synchronous
                    # requests.get inside a spider blocks the reactor --
                    # NOTE(review): consider a scrapy.Request instead.
                    uid = response.xpath('//input[@id="webMID"]/@value')
                    if uid != []:
                        uid = uid.extract_first()
                        url = 'http://www.elecfans.com/webapi/member/getUserInfoNew/uid/{}'.format(
                            str(uid))
                        data = requests.get(url).text
                        try:
                            data = json.loads(data)['data']['writer_uname']
                            author = data
                        except Exception as E:
                            pass
                item['Author'] = author
                tags = response.xpath(
                    '//ul[@class="hot-main clearfix"]/li/text()').extract()
                tag = ''
                if tags != []:
                    # Collapse whitespace and duplicate commas in the tag list.
                    tag = ','.join(tags).replace('\n', '').replace(
                        '\r', '').replace(' ', '').replace(',,', ',').strip(',')
                item['Keywords'] = tag
                str_time = '<div class="explain"><span>' + item['Author'] + '</span><time>' + \
                    item['News_Dt'].split(' ')[0] + '</time></div>'
                content1 = '<h1>' + item[
                    'News_Title'] + '</h1>' + str_time + "<div class='content'>" + content + "</div>"
                data = change_content(content1, self.start_urls[0])
                item['Content'] = data[0]
                # content2 = etree.HTML(item['Content'])
                img_list = data[1]
                get_pic(item, img_list)
                item['Update_Tm'] = get_time_stamp()
                item['Abstract'] = ''
                item['URL'] = response.url
                item['Web_Id'] = '5-24'
                #print(item)
                yield item
def parse_item2(self, response):
    """Parse an eet-china.com article page into an item (Web_Id 5-38).

    Falls back to the cover-image URL (from the div.cover-img style
    attribute) when the article body itself contains no images.
    """
    content = response.xpath('//div[@class="art-con article_body"]')
    if content != []:
        content = content.extract_first()
        title = response.xpath('//h1')
        if title != []:
            item = EleIndustryItem()
            # string(.) flattens any nested markup inside the <h1>.
            title = title.xpath('string(.)').extract_first().strip()
            node1 = response.xpath('//div[@class="detailwarn"]')
            if node1 != []:
                data = node1.xpath('string(.)').extract_first()
                time1 = re.findall(r'\d+-\d+-\d+', data)
                # Author appears as "作者:<name>" in the byline block.
                author = re.findall(r'作者:(.*)', data)
                if time1 != []:
                    time1 = time1[0] + ' ' + Get_Time()
                    author1 = ''
                    if author != []:
                        author1 = author[0].strip()
                    abstract = response.xpath(
                        '//span[@class="art-lead-text"]/text()'
                    ).extract_first()
                    Keywords = response.xpath(
                        '//div[@class="art-relative-tags"]/a/text()'
                    ).extract()
                    Keywords = ','.join(Keywords)
                    item['News_Title'] = title
                    item['News_Dt'] = time1
                    item['Author'] = author1
                    item['Keywords'] = Keywords
                    item['Abstract'] = abstract
                    # Rebuild <h1>title</h1> + byline + body for the pipeline.
                    str_time = '<div class="explain"><span>' + item[
                        'Author'] + '</span><time>' + str(
                            item['News_Dt'].split(
                                ' ')[0]) + '</time></div>'
                    content1 = '<h1>' + item[
                        'News_Title'] + '</h1>' + str_time + "<div class='content'>" + content + "</div>"
                    # change_content() appears to return (cleaned_html,
                    # image_url_list) -- TODO confirm against its definition.
                    data = change_content(content1, 'https://www.eet-china.com')
                    item['Content'] = data[0]
                    img_url2 = ''
                    # Cover image URL is embedded in the inline style:
                    # e.g. style="background:url(/path.jpg)".
                    img = response.xpath('//div[@class="cover-img"]/@style'
                                         ).extract_first()
                    if img is not None:
                        img = img.replace('(', '').replace(')', '')
                        img_url = re.search(r'url(.*)', img)
                        if img_url:
                            img_url1 = img_url.group(1)
                            img_url2 = urljoin('https://www.eet-china.com',
                                               img_url1)
                    img_list = data[1]
                    # Fall back to the cover image when the body had none.
                    if img_list == []:
                        if img_url2 != '':
                            img_list.append(img_url2)
                    get_pic(item, img_list)
                    item['Update_Tm'] = get_time_stamp()
                    item['URL'] = response.url
                    item['Web_Id'] = '5-38'
                    #print(item)
                    yield item