def parse_detail(self, response):
    # Article detail pages on www.ic37.com.
    content = response.xpath('//div[@class="contentlist"]')
    if not content:
        return
    item = EleIndustryItem()
    content = content.extract_first()

    title = response.xpath('//h1/text()')
    if not title:
        return
    title = title.extract_first().strip()

    node1 = response.xpath('//div[@class="newstitle"]/span/text()')
    if not node1:
        return
    node2 = node1.extract_first()

    # Publication date ("YYYY-MM-DD"); the current clock time is appended.
    time_1 = re.findall(r'\d+-\d+-\d+', node2)
    if not time_1:
        return
    time1 = time_1[0] + ' ' + Get_Time()

    try:
        author = node2.split(',')[1].strip()
    except Exception:
        author = ''

    node3 = response.xpath('//div[@class="newstitle"]/span/a/text()')
    keywords = node3.extract_first() if node3 else ''

    item['News_Title'] = title
    item['News_Dt'] = time1
    item['Author'] = author
    item['Keywords'] = keywords

    # Rebuild a self-contained HTML fragment: title, byline, then the body.
    str_time = ('<div class="explain"><span>' + item['Author'] +
                '</span><time>' + item['News_Dt'].split(' ')[0] + '</time></div>')
    content1 = ('<h1>' + item['News_Title'] + '</h1>' + str_time +
                "<div class='content'>" + content + "</div>")

    # change_content() rewrites relative links/images against the site root and
    # returns (cleaned_html, image_url_list).
    data = change_content(content1, 'http://www.ic37.com')
    item['Content'] = data[0]
    img_list = data[1]
    get_pic(item, img_list)

    item['Update_Tm'] = get_time_stamp()
    item['Abstract'] = ''
    item['URL'] = response.url
    item['Web_Id'] = '5-37'
    yield item
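# All of the callbacks in this section are Scrapy spider methods excerpted from
# several spider modules; they share a small set of project helpers (Get_Time,
# get_time_stamp, change_content, get_pic, md5_url) plus the EleIndustryItem
# item class, all imported elsewhere in those modules. The real implementations
# are not part of this excerpt. What follows is only a rough sketch, inferred
# from how the helpers are called above, of the interfaces they appear to have;
# get_pic (which presumably downloads the collected image URLs and attaches
# them to the item) is deliberately left out.
import hashlib
import time
from urllib.parse import urljoin

from lxml import html as lxml_html


def Get_Time():
    # Current wall-clock time, appended to the scraped publication date.
    return time.strftime('%H:%M:%S')


def get_time_stamp():
    # Unix timestamp (in seconds) used for the Update_Tm field.
    return int(time.time())


def change_content(fragment, base_url):
    # Normalise an assembled HTML fragment: make <img> sources absolute against
    # base_url and collect them, returning (cleaned_html, image_url_list).
    doc = lxml_html.fromstring(fragment)
    img_list = []
    for img in doc.xpath('//img'):
        absolute = urljoin(base_url, img.get('src', ''))
        img.set('src', absolute)
        img_list.append(absolute)
    return lxml_html.tostring(doc, encoding='unicode'), img_list


def md5_url(url):
    # Stable article id (aid) derived from the article URL.
    return hashlib.md5(url.encode('utf-8')).hexdigest()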
def parse_item1(self, response):
    content = response.xpath('//div[@class="content"]')
    if not content:
        return
    item = EleIndustryItem()
    content = content.extract_first()

    title = response.xpath('//h2/text()')
    if not title:
        return
    title = title.extract_first()

    time1 = response.xpath('//div[@class="title-box"]/p/span[1]/text()')
    if not time1:
        return
    # The span reads "<label>:<date>"; keep the date and append the current time.
    time1 = time1.extract_first().split(':')[1] + ' ' + Get_Time()

    author = response.xpath('//div[@class="title-box"]/p/span[2]/text()')
    author1 = ''
    if author:
        author1 = author.extract_first().split(':')[1]
        # Some pages put a domain name where the author should be; drop those.
        if author1.endswith('.com'):
            author1 = ''

    item['News_Title'] = title
    item['News_Dt'] = time1
    item['Author'] = author1
    item['Keywords'] = ''

    str_time = ('<div class="explain"><span>' + item['Author'] +
                '</span><time>' + item['News_Dt'].split(' ')[0] + '</time></div>')
    content1 = ('<h1>' + item['News_Title'] + '</h1>' + str_time +
                "<div class='content'>" + content + "</div>")

    data = change_content(content1, self.start_urls[0])
    item['Content'] = data[0]
    img_list = data[1]
    get_pic(item, img_list)

    item['Update_Tm'] = get_time_stamp()
    item['Abstract'] = ''
    item['URL'] = response.url
    item['Web_Id'] = '5-26'
    yield item
def parse_item(self, response):
    content = response.xpath('//*[@id="contentDiv"]')
    if not content:
        return
    content = content.extract_first()

    title = response.xpath('//h1/text()')
    if not title:
        return
    title = title.extract_first()

    time1 = response.xpath('//div[@class="authorTimeSource"]/span[2]/text()')
    if not time1:
        return
    item = EleIndustryItem()
    # "<label>:<date>" -> keep the date, append the current time.
    time1 = time1.extract_first().split(':')[1]
    time1 = time1 + ' ' + Get_Time()

    author = response.xpath(
        '//div[@class="authorTimeSource"]/span[3]/text()').extract_first()
    author = author.split(':')[1]

    # Keyword tags are flattened to text and joined into one comma-separated string.
    tags = response.xpath('//div[@class="keyWord"]//em').xpath('string(.)').extract()
    key = ','.join(tags)

    item['News_Title'] = title
    item['News_Dt'] = time1
    item['Author'] = author
    item['Keywords'] = key

    str_time = ('<div class="explain"><span>' + item['Author'] +
                '</span><time>' + item['News_Dt'].split(' ')[0] + '</time></div>')
    content1 = ('<h1>' + item['News_Title'] + '</h1>' + str_time +
                "<div class='content'>" + content + "</div>")

    data = change_content(content1, self.start_urls[0])
    item['Content'] = data[0]
    img_list = data[1]
    get_pic(item, img_list)

    item['Update_Tm'] = get_time_stamp()
    item['Abstract'] = ''
    item['URL'] = response.url
    item['Web_Id'] = '5-27'
    yield item
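# Several parsers above and below use the XPath string(.) function to flatten an
# element's nested children into a single text value before joining or
# regex-matching (the keyword <em> tags here, the byline/intro blocks in later
# callbacks). A small stand-alone illustration on made-up markup:
from scrapy.selector import Selector

sel = Selector(
    text='<div class="keyWord"><em>MCU<a> selection</a></em><em>DSP</em></div>')

# string(.) collapses each matched <em>, including its child <a>, to plain text.
tags = sel.xpath('//div[@class="keyWord"]//em').xpath('string(.)').extract()
print(tags)            # ['MCU selection', 'DSP']
print(','.join(tags))  # 'MCU selection,DSP'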
def parse_detail(self, response):
    # The listing callback pre-fills the item (title, date, cover image) and
    # passes it along in request.meta; this callback only adds the body.
    item = response.meta['meta_1']
    content = response.xpath('//*[@id="newsInfo"]')
    if not content:
        return
    content = content.extract_first()

    # Byline is fixed to the site's own brand (华强资讯, news.hqew.com).
    item['Author'] = '华强资讯'
    item['URL'] = response.url

    str_time = ('<div class="explain"><span>' + item['Author'] +
                '</span><time>' + item['News_Dt'].split(' ')[0] + '</time></div>')
    content1 = ('<h1>' + item['News_Title'] + '</h1>' + str_time +
                "<div class='content'>" + content + "</div>")

    data = change_content(content1, 'http://news.hqew.com')
    item['Content'] = data[0]

    # Drop the last in-body image; if nothing is left, fall back to the cover
    # image captured on the listing page.
    if data[1]:
        data[1].pop()
    if not data[1]:
        data[1].append(item['Image_URL'])
    img_list = data[1]
    get_pic(item, img_list)

    item['Update_Tm'] = get_time_stamp()
    item['Web_Id'] = '5-36'
    yield item
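# parse_detail above expects a half-built item in response.meta['meta_1'], so
# the listing-page callback has to create the item (filling at least
# News_Title, News_Dt and Image_URL, which are read later) and attach it to the
# detail request. That listing callback is not in this excerpt; the sketch
# below only illustrates the hand-off, with the XPaths, the callback name and
# 'import scrapy' assumed rather than taken from the project.
def parse_list(self, response):
    for row in response.xpath('//ul[@class="news-list"]/li'):
        item = EleIndustryItem()
        item['News_Title'] = row.xpath('./a/text()').extract_first('')
        item['News_Dt'] = row.xpath('./span/text()').extract_first('') + ' ' + Get_Time()
        item['Image_URL'] = row.xpath('./a/img/@src').extract_first('')
        detail_url = response.urljoin(row.xpath('./a/@href').extract_first())
        # Carry the partially-filled item to the detail page via request.meta.
        yield scrapy.Request(detail_url,
                             callback=self.parse_detail,
                             meta={'meta_1': item})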
def parse_item1(self, response):
    content = response.xpath('//div[@class="detailcon"]')
    if not content:
        return
    item = EleIndustryItem()
    content = content.extract_first()

    title = response.xpath('//div[@class=" detailtitle"]/text()')
    if not title:
        return
    title = title.extract_first().strip()

    # The intro line is "<author> | <date> ..."; split it on the pipe.
    node = response.xpath('//div[@class="detailintro"]').xpath('string(.)')
    if not node:
        return
    node = node.extract_first()
    node = node.split('|')
    author = node[0]

    time1 = re.compile(r'\d+-\d+-\d+', re.S).findall(node[1])
    if not time1:
        return
    time1 = time1[0] + ' ' + Get_Time()

    item['News_Title'] = title
    item['News_Dt'] = time1
    item['Author'] = author
    item['Keywords'] = ''

    str_time = ('<div class="explain"><span>' + item['Author'] +
                '</span><time>' + item['News_Dt'].split(' ')[0] + '</time></div>')
    content1 = ('<h1>' + item['News_Title'] + '</h1>' + str_time +
                "<div class='content'>" + content + "</div>")

    data = change_content(content1, self.start_urls[0])
    item['Content'] = data[0]
    img_list = data[1]
    get_pic(item, img_list)

    item['Update_Tm'] = get_time_stamp()
    item['Abstract'] = ''
    item['URL'] = response.url
    item['Web_Id'] = '5-27'
    yield item
def parse_item(self, response):
    content = response.xpath('//div[@class="article-bd"]')
    if not content:
        return
    item = EleIndustryItem()
    content = content.extract_first()

    title = response.xpath('//h1[@class="title-h1"]/text()')
    if not title:
        return
    title = title.extract_first().strip()

    time1_node = response.xpath('//div[@class="article-attr"]')
    if not time1_node:
        return
    time1 = time1_node.xpath('string(.)').extract_first()
    time1 = time1.replace('/', '-')
    # GetTime() normalises the extracted date string; None means it could not be parsed.
    time1 = GetTime(time1)
    if time1 is None:
        return

    author = ''
    keywords = response.xpath('//span[@class="article-keywords"]/a/text()').extract()
    keywords = ','.join(keywords)

    try:
        # Unlike the other spiders, the second argument here is an XPath rather
        # than a base URL.
        content = change_content(content, '//div[@class="article-bd"]')
    except Exception:
        logging.error('image error: ' + response.url)

    item['newsTitle'] = title
    item['newsDt'] = time1
    item['author'] = author
    item['keywords'] = keywords
    item['updateTm'] = get_time_stamp()
    item['abstract'] = ''
    item['url'] = response.url
    item['content'] = content
    item['aid'] = md5_url(item['url'])
    item['bid'] = '1'
    yield item
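# Unlike the other callbacks in this section, this spider writes lowercase
# fields (newsTitle, aid, bid, ...) rather than the News_Title-style fields, so
# the project evidently uses more than one item layout. The actual
# EleIndustryItem definition is not in this excerpt; as an assumption, a Scrapy
# item covering just the fields assigned above would look roughly like this:
import scrapy


class EleIndustryItem(scrapy.Item):
    newsTitle = scrapy.Field()
    newsDt = scrapy.Field()
    author = scrapy.Field()
    keywords = scrapy.Field()
    updateTm = scrapy.Field()
    abstract = scrapy.Field()
    url = scrapy.Field()
    content = scrapy.Field()
    aid = scrapy.Field()  # md5 of the article URL
    bid = scrapy.Field()  # source/board id; '1' in parse_item above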
def parse_item(self, response):
    # elecfans.com serves several page layouts; each branch below handles one
    # and yields the same item shape with Web_Id '5-24'.
    content = response.xpath('//div[@class="simditor-body clearfix"]')

    if not content:
        # Layout 1: forum (bbs) thread pages.
        content1 = response.xpath('//div[@class="pct"]')
        if content1:
            content1 = content1.extract_first()
            # Prefer the full date in the span title; fall back to the plain text.
            time1 = response.xpath('//div[@class="bar_tip float_l"]/em/span/@title')
            if not time1:
                time1 = response.xpath('//div[@class="bar_tip float_l"]/em/text()')
            if time1:
                time1 = re.findall(r'(\d+-\d+-\d+)', time1.extract_first())
                if time1:
                    title = response.xpath('//*[@id="thread_subject"]/text()')
                    if title:
                        item = EleIndustryItem()
                        item['News_Title'] = title.extract_first()
                        item['News_Dt'] = time1[0] + ' ' + Get_Time()
                        author = response.xpath(
                            '//div[@class="bar_tip float_l"]/div/span/a/span/text()')
                        author1 = ''
                        if author:
                            author1 = author.extract_first()
                        item['Author'] = author1
                        str_time = ('<div class="explain"><span>' + item['Author'] +
                                    '</span><time>' + item['News_Dt'].split(' ')[0] +
                                    '</time></div>')
                        content1 = ('<h1>' + item['News_Title'] + '</h1>' + str_time +
                                    "<div class='content'>" + content1 + "</div>")
                        data = change_content(content1, self.start_urls[0])
                        item['Content'] = data[0]
                        img_list = data[1]
                        get_pic(item, img_list)
                        item['Update_Tm'] = get_time_stamp()
                        item['Abstract'] = ''
                        item['Keywords'] = ''
                        item['URL'] = response.url
                        item['Web_Id'] = '5-24'
                        yield item

        if not content1:
            # Layouts 2 and 3: regular article pages and author-column pages.
            content2 = response.xpath('//div[@class="article-content"]')
            content3 = response.xpath('//div[@class="author_des"]')
            if content2:
                content2 = content2.extract_first()
                title = response.xpath('//h1/text()')
                if title:
                    title = title.extract_first()
                    time1 = response.xpath('//div[@class="fl"]/em[3]/text()')
                    if time1:
                        item = EleIndustryItem()
                        # Dates are written as "YYYY年MM月DD日"; rewrite to "YYYY-MM-DD".
                        time1 = time1.extract_first().replace('年', '-').replace(
                            '月', '-').replace('日', '')
                        time1 = time1.strip().split(' ')[0] + ' ' + Get_Time()
                        author = response.xpath('//div[@class="fl"]/em[1]/text()')
                        author1 = ''
                        if author:
                            author1 = author.extract_first()
                        tag = response.xpath('//div[@class="tag"]//span').xpath(
                            'string(.)').extract()
                        tags = ','.join(tag)
                        item['News_Title'] = title
                        item['Author'] = author1
                        item['News_Dt'] = time1
                        item['Keywords'] = tags
                        str_time = ('<div class="explain"><span>' + item['Author'] +
                                    '</span><time>' + item['News_Dt'].split(' ')[0] +
                                    '</time></div>')
                        content_1 = ('<h1>' + item['News_Title'] + '</h1>' + str_time +
                                     "<div class='content'>" + content2 + "</div>")
                        data = change_content(content_1, self.start_urls[0])
                        item['Content'] = data[0]
                        img_list = data[1]
                        get_pic(item, img_list)
                        item['Update_Tm'] = get_time_stamp()
                        item['Abstract'] = ''
                        item['URL'] = response.url
                        item['Web_Id'] = '5-24'
                        yield item

            if content3:
                content3 = content3.extract_first()
                title = response.xpath('//h1/text()')
                if title:
                    title = title.extract_first()
                    span_node = response.xpath(
                        '//span[@class="float_left font-small color_gray"]').xpath(
                            'string(.)').extract_first()
                    # The byline text must split into exactly three space-separated pieces.
                    span_text = span_node.split(' ')
                    if len(span_text) == 3:
                        item = EleIndustryItem()
                        # "发表于" means "posted on"; strip it and any whitespace
                        # from the author piece.
                        author = span_text[0].replace('\r', '').replace('\n', '').replace(
                            '\t', '').replace('发表于', '')
                        time1 = span_text[1] + ' ' + Get_Time()
                        tag = response.xpath(
                            '//ul[@class="article_tags clearfix"]/li/span').xpath(
                                'string(.)').extract()
                        tags = ','.join(tag)
                        item['News_Title'] = title
                        item['Author'] = author
                        item['News_Dt'] = time1
                        item['Keywords'] = tags
                        str_time = ('<div class="explain"><span>' + item['Author'] +
                                    '</span><time>' + item['News_Dt'].split(' ')[0] +
                                    '</time></div>')
                        content_1 = ('<h1>' + item['News_Title'] + '</h1>' + str_time +
                                     "<div class='content'>" + content3 + "</div>")
                        data = change_content(content_1, self.start_urls[0])
                        item['Content'] = data[0]
                        img_list = data[1]
                        get_pic(item, img_list)
                        item['Update_Tm'] = get_time_stamp()
                        item['Abstract'] = ''
                        item['URL'] = response.url
                        item['Web_Id'] = '5-24'
                        yield item

    if content:
        # Layout 4: standard news pages with a simditor body.
        content = content.extract_first()
        title = response.xpath('//h1/text()')
        if title:
            title = title.extract_first()
            time1 = response.xpath('//section//span[@class="time"]/text()')
            if time1:
                item = EleIndustryItem()
                item['News_Title'] = title
                time1 = time1.extract_first().replace('年', '-').replace(
                    '月', '-').replace('日', '')
                time1 = time1.strip().split(' ')[0] + ' ' + Get_Time()
                item['News_Dt'] = time1
                author2 = response.xpath(
                    '//div[@class="article-info art-share-layout m-share-layout clearfix"]/a'
                ).xpath('string(.)').extract()
                # Fall back to the site byline (电子发烧友网, elecfans.com).
                author = '电子发烧友网'
                if author2:
                    author = author2[0]
                    if author == '':
                        author = '电子发烧友网'
                    # Some pages expose only the author's uid; resolve the display
                    # name through the site's member API.
                    if author2 == ['']:
                        uid = response.xpath('//input[@id="webMID"]/@value')
                        if uid:
                            uid = uid.extract_first()
                            url = ('http://www.elecfans.com/webapi/member/'
                                   'getUserInfoNew/uid/{}'.format(uid))
                            data = requests.get(url).text
                            try:
                                author = json.loads(data)['data']['writer_uname']
                            except Exception:
                                pass
                item['Author'] = author
                tags = response.xpath(
                    '//ul[@class="hot-main clearfix"]/li/text()').extract()
                tag = ''
                if tags:
                    tag = ','.join(tags).replace('\n', '').replace('\r', '').replace(
                        ' ', '').replace(',,', ',').strip(',')
                item['Keywords'] = tag
                str_time = ('<div class="explain"><span>' + item['Author'] +
                            '</span><time>' + item['News_Dt'].split(' ')[0] +
                            '</time></div>')
                content1 = ('<h1>' + item['News_Title'] + '</h1>' + str_time +
                            "<div class='content'>" + content + "</div>")
                data = change_content(content1, self.start_urls[0])
                item['Content'] = data[0]
                img_list = data[1]
                get_pic(item, img_list)
                item['Update_Tm'] = get_time_stamp()
                item['Abstract'] = ''
                item['URL'] = response.url
                item['Web_Id'] = '5-24'
                yield item
def parse_item2(self, response):
    # Article detail pages on www.eet-china.com.
    content = response.xpath('//div[@class="art-con article_body"]')
    if not content:
        return
    content = content.extract_first()

    title = response.xpath('//h1')
    if not title:
        return
    item = EleIndustryItem()
    title = title.xpath('string(.)').extract_first().strip()

    node1 = response.xpath('//div[@class="detailwarn"]')
    if not node1:
        return
    data = node1.xpath('string(.)').extract_first()
    time1 = re.findall(r'\d+-\d+-\d+', data)
    author = re.findall(r'作者:(.*)', data)  # "作者" means "author"
    if not time1:
        return
    time1 = time1[0] + ' ' + Get_Time()

    author1 = ''
    if author:
        author1 = author[0].strip()

    abstract = response.xpath('//span[@class="art-lead-text"]/text()').extract_first()
    Keywords = response.xpath('//div[@class="art-relative-tags"]/a/text()').extract()
    Keywords = ','.join(Keywords)

    item['News_Title'] = title
    item['News_Dt'] = time1
    item['Author'] = author1
    item['Keywords'] = Keywords
    item['Abstract'] = abstract

    str_time = ('<div class="explain"><span>' + item['Author'] +
                '</span><time>' + item['News_Dt'].split(' ')[0] + '</time></div>')
    content1 = ('<h1>' + item['News_Title'] + '</h1>' + str_time +
                "<div class='content'>" + content + "</div>")

    data = change_content(content1, 'https://www.eet-china.com')
    item['Content'] = data[0]

    # The cover image URL is embedded in the inline style as url(...); strip the
    # parentheses and take what follows 'url'.
    img_url2 = ''
    img = response.xpath('//div[@class="cover-img"]/@style').extract_first()
    if img is not None:
        img = img.replace('(', '').replace(')', '')
        img_url = re.search(r'url(.*)', img)
        if img_url:
            img_url1 = img_url.group(1)
            img_url2 = urljoin('https://www.eet-china.com', img_url1)

    # If the body itself contained no images, fall back to the cover image.
    img_list = data[1]
    if not img_list and img_url2 != '':
        img_list.append(img_url2)
    get_pic(item, img_list)

    item['Update_Tm'] = get_time_stamp()
    item['URL'] = response.url
    item['Web_Id'] = '5-38'
    yield item
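# For context: detail callbacks like parse_item2 are reached either from a
# hand-written listing callback (as sketched after the hqew.com parse_detail
# above) or from CrawlSpider rules. Below is a minimal, assumed example of
# rule-based wiring for the eet-china.com spider; the spider name, start_urls
# and the allow pattern are placeholders rather than the project's actual
# configuration, and parse_item2 above would be a method of this class.
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class EetChinaSpider(CrawlSpider):
    name = 'eet_china'
    allowed_domains = ['eet-china.com']
    start_urls = ['https://www.eet-china.com/news/']

    rules = (
        # Follow article links and hand each response to parse_item2.
        Rule(LinkExtractor(allow=r'/news/\d+\.html'), callback='parse_item2'),
    )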