def parse(self, response):
    for project in response.css('ul.vT-srch-result-list-bid').css('li'):
        project_name = project.css('a::text').extract()[0].strip()
        province = project.css('a::text').extract()[1].strip()
        money_url = project.css('li').css('a::attr(href)').extract_first()

        item = GovItem()
        item['name'] = project_name
        item['pro'] = province

        # Hand the partially filled item to the detail callback through request.meta.
        request = response.follow(money_url, self.parse_money)
        request.meta['item'] = item
        yield request

    prefix = 'http://search.ccgp.gov.cn/bxsearch?searchtype=2&page_index='
    suffix = '&bidSort=&buyerName=&projectId=&pinMu=&bidType=&dbselect=bidx&kw=%E5%8C%BB%E6%83%A0%E7%A7%91%E6%8A%80&start_time=2017%3A09%3A21&end_time=2017%3A12%3A22&timeType=4&displayZone=&zoneId=&pppStatus=0&agentName='
    next_page = prefix + str(3) + suffix  # page index is hard-coded to 3 here
    yield response.follow(next_page, callback=self.parse)
def hlj_parse(self, response):
    count = 1
    browser = response.browser
    while count < self.page:
        infos = browser.find_elements_by_css_selector('.td1 tr')
        for info in infos:
            try:
                if info.find_elements_by_css_selector('td'):
                    # Create a fresh item per row so yielded items do not share state.
                    item = GovItem()
                    item['title'] = info.find_element_by_css_selector('td:nth-of-type(2)').text
                    detail_url = info.find_element_by_css_selector('tr div a').get_attribute('href')
                    item['detail_url'] = detail_url
                    item['department'] = info.find_element_by_css_selector('td:nth-of-type(3)').text
                    item['res_date'] = info.find_element_by_css_selector('td:nth-of-type(4)').text
                    self.browser2.get(detail_url)
                    self.parse_detail(self.browser2, item)
                    yield item
            except StaleElementReferenceException:
                pass
        count += 1
        # Trigger the ASP.NET pager to load the next page.
        browser.execute_script(
            "__doPostBack('AspNetPager1','{page}')".format(page=str(count)))
def request_back(self, response):
    data = json.loads(response.text)
    if 'message' in data.keys():
        message = data['message']
        if message and message.lower() == 'success':
            max_behot_time = data['next']['max_behot_time']
            data_items = data['data']
            for data_item in data_items:
                title = data_item['title']
                item_id = data_item['item_id']
                sourceOrg = data_item['source']
                timeStamp = data_item['behot_time']
                timeArray = time.localtime(timeStamp)
                behotTime = time.strftime("%Y--%m--%d %H:%M:%S", timeArray)
                detail_url = "https://www.toutiao.com/group/" + item_id
                # Detail-page scraping is currently disabled:
                # yield scrapy.Request(detail_url, headers=self.header2,
                #                      callback=lambda response, title=title:
                #                      self.content_request(response, title))
                if 'comments_count' in data_item.keys():
                    comment_counts = data_item['comments_count']
                    print("title: " + title + " item_id: " + item_id +
                          " detail_url: " + detail_url +
                          " comment_counts: " + str(comment_counts))
                    comment_url = self.create_comment_url(item_id, 0, comment_counts)
                    # Default arguments on the lambda freeze the per-item values for the callback.
                    yield scrapy.Request(
                        comment_url,
                        headers=self.header1,
                        callback=lambda response, title=title, detail_url=detail_url,
                        sourceOrg=sourceOrg, behotTime=behotTime: self.comment_request(
                            response, title, detail_url, sourceOrg, behotTime))
                else:
                    print("title: " + title + " item_id: " + item_id +
                          " detail_url: " + detail_url)
                    item = GovItem()
                    item['title'] = title
                    item['content'] = ''
                    item['sourceOrg'] = sourceOrg
                    item['comments'] = ''
                    item['publishTime'] = behotTime
                    item['sourceUrl'] = detail_url
                    yield item
            # Follow the feed again with the new max_behot_time to page through results.
            rewriteUrl = 'https://www.toutiao.com/api/pc/feed/?category=news_finance&utm_source=toutiao&widen=1&max_behot_time={0}&max_behot_time_tmp={0}&tadrequire=true'
            rewriteUrl = rewriteUrl.format(max_behot_time)
            print(rewriteUrl)
            yield scrapy.Request(rewriteUrl,
                                 callback=self.request_back,
                                 headers=self.header1,
                                 dont_filter=True)
        else:
            print(message + " " + str(response.url).strip())
            time.sleep(2)  # note: time.sleep blocks the whole Scrapy reactor
            yield scrapy.Request(str(response.url).strip(),
                                 callback=self.request_back,
                                 headers=self.header1,
                                 dont_filter=True)
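# The lambda-with-default-arguments callbacks above work, but on Scrapy 1.7+ the same
# per-item values can be forwarded with cb_kwargs, the mechanism _real_parse_item()
# already uses for its item. A minimal, hedged sketch of that alternative (the spider
# name and URL below are placeholders, not the project's real values):
import scrapy


class CbKwargsSketchSpider(scrapy.Spider):
    name = 'cb_kwargs_sketch'
    start_urls = ['https://example.org/']  # placeholder

    def parse(self, response):
        # Forward extra values to the callback without freezing them in a lambda.
        yield scrapy.Request(
            response.url,
            callback=self.comment_request,
            cb_kwargs=dict(title='some title', detail_url=response.url,
                           sourceOrg='头条', behotTime=''),
            dont_filter=True)

    def comment_request(self, response, title, detail_url, sourceOrg, behotTime):
        # Build the GovItem here exactly as the real comment callback does.
        self.logger.info('%s %s %s %s', title, detail_url, sourceOrg, behotTime)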
def cpppc_req_content(self, response, title, pub_time):
    content = response.xpath("//div[@class='cont']").extract()[0]
    item = GovItem()
    item['title'] = title
    item['content'] = content
    item['sourceOrg'] = '政府和社会资本合作中心'
    item['comments'] = ''
    item['publishTime'] = pub_time
    item['sourceUrl'] = response.url
    yield item
def hb_content_detail(self, response, title, pub_time):
    org_info = "河北省财政厅"
    content = response.xpath("//div[@class='content']").extract()[0]
    item = GovItem()
    item['title'] = title
    item['content'] = content
    item['sourceOrg'] = org_info
    item['comments'] = ''
    item['publishTime'] = pub_time
    item['sourceUrl'] = response.url
    yield item
def sh_content_detail(self, response, title, pub_time):
    content = response.xpath("//div[@class='article_content']").extract()[0]
    item = GovItem()
    item['title'] = title
    item['content'] = content
    item['sourceOrg'] = "上海财政局"
    item['comments'] = ''
    item['publishTime'] = pub_time
    item['sourceUrl'] = response.url
    yield item
def jcz_cq_content_detail(self, response, title, pub_time):
    content = response.xpath("//div[@id='showcontent']").extract()[0]
    # org_info = response.xpath("//*[@id='text']/h3/text()").extract()[0]
    item = GovItem()
    item['title'] = title
    item['content'] = content
    item['sourceOrg'] = "重庆财政局"
    item['comments'] = ''
    item['publishTime'] = pub_time
    item['sourceUrl'] = response.url
    yield item
def zbr_req_content(self, response, title):
    pub_time = response.xpath("//div[@class='project_d_t']/span/text()").extract()[0]
    content = response.xpath("//div[@class='project_d_c']").extract()[0]
    item = GovItem()
    item['title'] = title
    item['content'] = content
    item['sourceOrg'] = '智博睿'
    item['comments'] = ''
    item['publishTime'] = pub_time
    item['sourceUrl'] = response.url
    yield item
def tj_content_detail(self, response, title, pub_time):
    content = response.xpath("//div[@id='zoom']").extract()[0]
    org_info = response.xpath("//table[@id='c']/tr[3]/td/span[2]/text()").extract()[0]
    source_org = org_info.replace('来源:', '')
    item = GovItem()
    item['title'] = title
    item['content'] = content
    item['sourceOrg'] = source_org
    item['comments'] = ''
    item['publishTime'] = pub_time
    item['sourceUrl'] = response.url
    yield item
def szfb_content_detail(self, response, title, pub_time):
    org_info = response.xpath("//div[@class='tit']/h6/span[1]/text()").extract()[0]
    content = response.xpath("//div[@class='news_cont_d_wrap']").extract()[0]
    item = GovItem()
    item['title'] = title
    item['content'] = content
    item['sourceOrg'] = org_info.replace('信息来源:', '')
    item['comments'] = ''
    item['publishTime'] = pub_time
    item['sourceUrl'] = response.url
    yield item
def bjcz_content_detail(self, response, title):
    pub_time = response.xpath(
        "//span[@style='display: inline-block;margin:15px 10px;font-size: 14px;'][1]/text()"
    ).extract()[0]
    content = response.xpath("//div[@class='txt']").extract()[0]
    item = GovItem()
    item['title'] = title
    item['content'] = content
    item['sourceOrg'] = "北京财政局"
    item['comments'] = ''
    item['publishTime'] = pub_time
    item['sourceUrl'] = response.url
    yield item
def parse(self, response):
    hot_list = [
        'http://top.baidu.com/buzz?b=342&c=513&fr=topbuzz_b42_c513',
        'http://top.baidu.com/buzz?b=341&c=513&fr=topbuzz_b42_c513',
        'http://top.baidu.com/buzz?b=42&c=513&fr=topbuzz_b342_c513'
    ]
    sub_name_list = []
    for hot_url in hot_list:
        sourceReq = requests.get(hot_url)
        sourceReq.encoding = 'gb2312'
        sourceHtml = sourceReq.text
        selector = etree.HTML(sourceHtml)
        items = selector.xpath("//table[@class='list-table']/tr")
        if 'b=342' in hot_url:
            hot_type = '民生热点'
        elif 'b=341' in hot_url:
            hot_type = '今日热点'
        else:
            hot_type = '七日热点'
        count_index = 1
        for item in items:
            if count_index != 1:  # skip the table header row
                subject_name = item.xpath(
                    "./td[@class='keyword']/a[@class='list-title']/text()")[0]
                hot_num = item.xpath("./td[@class='last']/span/text()")[0]
                icon_statu = item.xpath("./td[@class='keyword']/span/@class")
                status = ''
                if icon_statu:
                    if 'icon-new' in icon_statu[0]:
                        status = '新'
                print(status)
                govItem = GovItem()
                govItem['subName'] = subject_name
                govItem['hotNum'] = hot_num
                govItem['hotType'] = hot_type
                govItem['status'] = status
                sub_name_list.append(govItem)
            count_index = count_index + 1
    if len(sub_name_list) > 0:
        dist_list = []
        # Drop duplicate subjects, keeping the first occurrence of each subName.
        unique = collections.OrderedDict()
        for govItem in sub_name_list:
            unique.setdefault(govItem["subName"], govItem)
        for item in unique.values():
            dist_list.append(item)
            yield item
        print(len(dist_list))
def content_request(self, response, title):
    str_body = response.body.decode(response.encoding)
    print(str_body)
    content = self.parse_page_detail(str_body)
    if content is None:
        content = title
    item = GovItem()
    item['title'] = title
    item['content'] = content
    item['sourceOrg'] = '头条'
    item['publishTime'] = ''
    item['sourceUrl'] = response.url
    item['comments'] = ''
    # Note: the item is only printed here; this callback does not yield it.
    print(item['content'])
def gd_content_detail(self, response, title, pub_time):
    org_info = response.xpath("//div[@class='meta']/div/span[2]/text()").extract()[0]
    source_org = org_info.replace('来源:', '')
    if not source_org:  # fall back when the page carries no source label
        source_org = "广东财政局"
    content = response.xpath("//div[@class='content']/p").extract()[0]
    item = GovItem()
    item['title'] = title
    item['content'] = content
    item['sourceOrg'] = source_org
    item['comments'] = ''
    item['publishTime'] = pub_time
    item['sourceUrl'] = response.url
    yield item
def comment_request(self, response, title, detail_url, sourceOrg, behotTime):
    comment_datas = json.loads(response.text)['data']['comments']
    item = GovItem()
    item['title'] = title
    item['content'] = ''
    item['sourceOrg'] = sourceOrg
    item['publishTime'] = behotTime
    item['sourceUrl'] = detail_url
    # Default page size is 5.
    comments_list = []
    for comment in comment_datas:
        comments_list.append(comment['text'])
    item['comments'] = comments_list
    yield item
def parse(self, response):
    items = []
    for project in response.css('ul.vT-srch-result-list-bid').css("li"):
        item = GovItem()
        item['name'] = project.css("a::text").extract()[0]
        item['pro'] = project.css("a::text").extract()[1]
        item['info'] = project.css("p::text").extract()
        items.append(item)
    # Pagination is currently disabled:
    # prefix = 'http://search.ccgp.gov.cn/bxsearch?searchtype=2&page_index='
    # suffix = '&bidSort=&buyerName=&projectId=&pinMu=&bidType=&dbselect=bidx&kw=%E5%8C%BB%E6%83%A0%E7%A7%91%E6%8A%80&start_time=2017%3A09%3A21&end_time=2017%3A12%3A22&timeType=4&displayZone=&zoneId=&pppStatus=0&agentName='
    # next_page = prefix + str(2) + suffix
    # yield response.follow(next_page, callback=self.parse)
    return items
def mof_cn_content_detail(self, response, title, pub_time):
    org_path = '//*[@id="tb_select"]/option/text()'
    source_org = response.xpath(org_path).extract()
    if len(source_org) == 0:
        source_org = '财政部'
    else:
        source_org = source_org[0]
    content_path = '//*[@id="Zoom"]'
    content = response.xpath(content_path).extract()[0]
    item = GovItem()
    item['title'] = title
    item['content'] = content
    item['sourceOrg'] = source_org
    item['comments'] = ''
    item['publishTime'] = pub_time
    item['sourceUrl'] = response.url
    yield item
def _real_parse_item(self, response):
    item = GovItem(domain_collection=None,
                   html=None,
                   pdf=[],
                   xls=[],
                   images=[],
                   others=[])
    # 1. Save the HTML page itself.
    filename = make_file_name(response.url, 'html')
    item['html'] = filename
    domain = response.url.split('/')[2]
    item['domain_collection'] = md5_encode(domain)
    abpath = DATA_DIR + item['domain_collection']
    if not os.path.exists(abpath):
        # Create the per-domain directory on first use.
        os.makedirs(abpath)
    with open(abpath + '/' + filename, 'wb') as f:
        f.write(response.body)
    # 2. Save linked resources (images, PDFs, spreadsheets).
    images = response.selector.xpath('//img/@src').extract()
    pdf = response.selector.xpath('//a/@href[contains(.,".pdf")]').extract()
    xls = response.selector.xpath('//a/@href[contains(.,".xls")]').extract()
    urls = images + pdf + xls
    if urls:
        for url in urls:
            # A Splash-rendered variant is kept here for reference:
            # url = response.urljoin(url)
            # self.logger.info(url)
            # yield scrapy.Request(
            #     "http://localhost:8050/render.html?url=" + url,
            #     callback=self.save_files,
            #     cb_kwargs=dict(item=item))
            yield response.follow(url,
                                  callback=self.save_files,
                                  cb_kwargs=dict(item=item))
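# _real_parse_item() hands every resource URL to self.save_files, which is not shown
# in this section. A hedged sketch of what that callback could look like, assuming it
# mirrors the HTML-saving logic above (the real implementation may differ):
def save_files(self, response, item):
    # Choose the item bucket from the URL suffix; unknown types go to 'others'.
    url = response.url.split('?')[0]
    ext = url.rsplit('.', 1)[-1].lower() if '.' in url else 'bin'
    if ext == 'pdf':
        bucket = 'pdf'
    elif ext in ('xls', 'xlsx'):
        bucket = 'xls'
    elif ext in ('jpg', 'jpeg', 'png', 'gif'):
        bucket = 'images'
    else:
        bucket = 'others'
    filename = make_file_name(response.url, ext)  # same helper used for the HTML file
    item[bucket].append(filename)
    with open(DATA_DIR + item['domain_collection'] + '/' + filename, 'wb') as f:
        f.write(response.body)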
def sc_content_detail(self, response):
    title = response.xpath("//div[@class='infoTxt']/p/text()").extract()[0]
    org_info = response.xpath("//div[@class='infoTxt']/div[1]/div[1]/text()").extract()[0]
    pub_time = response.xpath("//div[@class='infoTxt']/div[1]/div[2]/text()").extract()[0]
    content = response.xpath("//div[@class='txt2-in']").extract()[0]
    source_org = org_info.replace('信息来源:', '')
    if not source_org:  # fall back when the page carries no source label
        source_org = "四川财政局"
    item = GovItem()
    item['title'] = title
    item['content'] = content
    item['sourceOrg'] = source_org
    item['comments'] = ''
    item['publishTime'] = pub_time
    item['sourceUrl'] = response.url
    yield item
def req_bjh_content(self, response, title, source_org, pub_time, source_url, keyword):
    contents = response.xpath("//div[@class='article-content']/p").xpath('string(.)').extract()
    print('++++++++++++')
    content = ''
    for item in contents:
        content = content + item + '\n'
    print(content)
    govItem = GovItem()
    govItem['title'] = title
    govItem['content'] = content
    govItem['sourceOrg'] = source_org
    govItem['comments'] = ''
    govItem['publishTime'] = pub_time
    govItem['sourceUrl'] = source_url
    govItem['subName'] = keyword
    yield govItem
def zj_parse(self, response):
    browser = response.browser
    infos = browser.find_elements_by_css_selector('tr[valign=top]')
    for info in infos:
        try:
            if info.find_element_by_css_selector('a'):  # only rows that contain an <a> element
                # Create a fresh item per row so yielded items do not share state.
                item = GovItem()
                item['title'] = info.find_element_by_css_selector('.text').text
                detail_url = info.find_element_by_css_selector('a').get_attribute('href')
                item['detail_url'] = detail_url
                self.browser2.get(detail_url)
                self.parse_detail(self.browser2, item)
                yield item
        except (StaleElementReferenceException, NoSuchElementException):
            pass
    self.count += 1
    yield Request(self.start_url.format(page=self.count), self.zj_parse)
def bd_req_page(self, response):
    items = response.xpath("//div[@class='result']")
    for item in items:
        title = item.xpath("./h3/a/text()").extract()[0].replace(' ', '').replace('\n', '')
        content = item.xpath("./div").extract()
        # Spaces are stripped before matching, so the lookahead sees '<spanclass'.
        pattern1 = re.compile(r'(?<=(</p>))\S*?(?=(<spanclass))')
        match_content = pattern1.search(str(content).replace(' ', ''))
        whole_content = ''
        if match_content:
            whole_content = match_content.group()
        whole_content = whole_content.replace('<em>', '').replace('</em>', '').replace('\\n', '')
        # Use a separate name for the scraped item so it does not shadow the loop variable.
        govItem = GovItem()
        govItem['title'] = title
        govItem['content'] = whole_content
        govItem['sourceOrg'] = 'baidu'
        govItem['comments'] = ''
        govItem['publishTime'] = str(datetime.date.today())
        govItem['sourceUrl'] = 'www.baidu.com/news/search'
        yield govItem
def hn_parse(self, response):
    browser = response.browser
    letters = browser.find_elements_by_css_selector('div.myxjgs-content div table tbody tr')
    for letter in letters:
        try:
            # Create a fresh item per row so yielded items do not share state.
            item = GovItem()
            item['title'] = letter.find_element_by_css_selector('td:nth-child(1) a').text
            detail_url = letter.find_element_by_css_selector('td:nth-child(1) a').get_attribute('href')
            item['detail_url'] = detail_url
            item['department'] = letter.find_element_by_css_selector('td:nth-child(2) a').text
            item['raise_date'] = letter.find_element_by_css_selector('td:nth-child(3) span').text
            item['res_date'] = letter.find_element_by_css_selector('td:nth-child(4) span').text  # may be None
            self.browser2.get(detail_url)
            self.parse_detail(self.browser2, item)
            yield item
        except (NoSuchElementException, StaleElementReferenceException):
            pass
    if self.offset < self.page:
        self.offset += 15
        yield Request(self.basic_url.format(offset=self.offset), self.hn_parse)
def bj_parse(self, response):
    browser = response.browser
    letters = browser.find_elements_by_css_selector('#newLetterReply li')
    for letter in letters:
        try:
            # Create a fresh item per row so yielded items do not share state.
            item = GovItem()
            item['title'] = letter.find_element_by_css_selector('p.font14.mymail_title a span').text
            detail_url = letter.find_element_by_css_selector('p.font14.mymail_title a').get_attribute('href')
            item['detail_url'] = detail_url
            item['department'] = letter.find_element_by_css_selector('.font12.gray .mail_margin[name]').text
            self.browser2.get(detail_url)
            self.parse_detail(self.browser2, item)
            yield item
        except (NoSuchElementException, StaleElementReferenceException):
            pass
    self.count += 1
    yield Request(self.beijing_url.format(PCon=self.count, type='nextPage'), self.bj_parse)
def sh_parse(self, response):
    browser = response.browser
    infos = browser.find_elements_by_css_selector('#FBList tr')
    for info in infos:
        try:
            if info.find_element_by_css_selector('a'):  # only rows that contain an <a> element
                # Create a fresh item per row so yielded items do not share state.
                item = GovItem()
                item['title'] = info.find_element_by_css_selector('td a').text
                detail_url = info.find_element_by_css_selector('td a').get_attribute('href')
                item['detail_url'] = detail_url
                item['department'] = info.find_element_by_css_selector('span').text
                item['res_date'] = info.find_element_by_css_selector('td:nth-of-type(6)').text
                self.browser2.get(detail_url)
                self.parse_detail(self.browser2, item)
                yield item
        except NoSuchElementException:
            pass
    self.count += 1
    yield Request(self.start_url.format(page=self.count), self.sh_parse)
def zj_content_detail(self, response):
    str_body = response.body.decode(response.encoding)
    # Pull the publication date out of the raw HTML with a lookbehind/lookahead regex.
    pattern1 = re.compile(r'(?<=(发布日期:))\S*?(?=(</td))')
    match_time = pattern1.search(str_body)
    pub_time = ''
    if match_time:
        pub_time = match_time.group()
    # The source organisation sits between an HTML comment marker and the next '<!'.
    pattern2 = re.compile(r'(?<=(信息来源]>begin-->))\S*?(?=(<!))')
    match_org = pattern2.search(str_body)
    org_info = '浙江财政局'
    if match_org:
        org_info = match_org.group()
    content = response.xpath("//div[@id='zoom']").extract()[0]
    title = response.xpath("//title/text()").extract()[0]
    item = GovItem()
    item['title'] = title
    item['content'] = content
    item['sourceOrg'] = org_info
    item['comments'] = ''
    item['publishTime'] = pub_time
    item['sourceUrl'] = response.url
    yield item
def item_parse(self, publish_time, title, content, comment_url, thread_id, source_org):
    detail_comment_url = self.create_comment_url(thread_id, 30, 0)
    item = GovItem()
    item['title'] = title
    item['content'] = content
    item['sourceOrg'] = source_org
    item['publishTime'] = publish_time
    item['sourceUrl'] = comment_url
    print("title: " + title + " publish_time: " + publish_time +
          " source_org: " + source_org + " comment_url: " + comment_url +
          " publishTime " + publish_time + " content " + content)
    # First request only fetches the total number of comments.
    data = self.comment_parse(detail_comment_url)
    list_size = data.get('newListSize')
    if list_size != 0:
        if list_size <= 300:
            limit = 30
        else:
            limit = 40  # 40 is the maximum page size; larger values raise a pagination-parameter error
        page_num = math.ceil(list_size / limit)
        comments_list = []
        page = 0
        while page < page_num:
            offset = page * limit
            detail_comment_url = self.create_comment_url(thread_id, limit, offset)
            print(detail_comment_url)
            data = self.comment_parse(detail_comment_url)
            for key in data['comments'].keys():
                user_name = jsonpath.jsonpath(data['comments'][key], '$..nickname')
                user_name = user_name[0] if user_name else ''
                location = jsonpath.jsonpath(data['comments'][key], '$..location')
                location = location[0] if location else ''
                timeArray = time.strptime(
                    data['comments'][key]['createTime'].strip(), "%Y-%m-%d %H:%M:%S")
                timeStamp = int(time.mktime(timeArray) * 1000)
                json_str = {
                    "content": data['comments'][key]['content'].replace('[', '').replace(']', ''),
                    "userName": user_name,
                    "place": location,
                    "hotNum": data['comments'][key]['vote'],
                    "publishDate": timeStamp
                }
                comments_list.append(json_str)
            page = page + 1
        item['comments'] = comments_list
    else:
        item['comments'] = ''
    yield item
def bd_req_page(self, response, keyword):
    items = response.xpath("//div[@class='result']")
    for item in items:
        title = (item.xpath("./h3/a").xpath('string(.)').extract()[0]
                 .replace('\n', '').replace(' ', ''))
        # Split the "source / time" line on non-breaking spaces (\xa0).
        source_time = (item.xpath("./div/p").xpath('string(.)').extract()[0]
                       .replace('\n', '').replace('\t', '').replace(' ', '')
                       .split('\xa0'))
        source_org = source_time[0]
        whole_time = source_time[2]
        if '小时前' in whole_time:
            # "N hours ago": current time in milliseconds minus N hours.
            t = time.time()
            timeStamp = int(round(t * 1000))
            hoursStamp = int(whole_time.replace('小时前', '')) * 60 * 60 * 1000
            pub_time = timeStamp - hoursStamp
        elif '分钟前' in whole_time:
            # "N minutes ago": current time in milliseconds minus N minutes.
            t = time.time()
            timeStamp = int(round(t * 1000))
            hoursStamp = int(whole_time.replace('分钟前', '')) * 60 * 1000
            pub_time = timeStamp - hoursStamp
        else:
            # Absolute date, e.g. 2018年07月18日10:05.
            format_time = whole_time.replace('年', '-').replace('月', '-').replace('日', ' ')
            timeArray = time.strptime(format_time, "%Y-%m-%d %H:%M")
            pub_time = int(time.mktime(timeArray) * 1000)
        source_url = item.xpath("./h3/a/@href").extract()[0]
        if 'baijiahao.baidu.com' in source_url:
            print('baijiahao')
            yield scrapy.Request(
                source_url,
                callback=lambda response, title=title, source_org=source_org,
                pub_time=pub_time, source_url=source_url, keyword=keyword:
                self.req_bjh_content(response, title, source_org, pub_time,
                                     source_url, keyword))
        else:
            print('not baijiahao')
            str_content = item.xpath("./div").extract()[0].replace('\n', '').replace('\t', '').replace(' ', '')
            pattern1 = re.compile(r'(?<=(</p>))\S*?(?=(<span))')
            match_content = pattern1.search(str_content)
            if match_content:
                content = match_content.group().replace('<em>', '').replace('</em>', '')
            else:
                content = item.xpath("./div/text()").extract()[0]
            govItem = GovItem()
            govItem['title'] = title
            govItem['content'] = content
            govItem['sourceOrg'] = source_org
            govItem['comments'] = ''
            govItem['publishTime'] = pub_time
            govItem['sourceUrl'] = source_url
            govItem['subName'] = keyword
            yield govItem
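# Most detail callbacks in this file fill the same GovItem fields by hand. A small
# helper could remove that duplication; this is a hypothetical refactor sketch, not
# existing project code, and it assumes GovItem exposes the fields used throughout:
def build_gov_item(title, content, source_org, pub_time, source_url,
                   comments='', sub_name=None):
    item = GovItem()
    item['title'] = title
    item['content'] = content
    item['sourceOrg'] = source_org
    item['comments'] = comments
    item['publishTime'] = pub_time
    item['sourceUrl'] = source_url
    if sub_name is not None:
        item['subName'] = sub_name
    return item

# Example use inside a callback such as sh_content_detail():
#     yield build_gov_item(title, content, "上海财政局", pub_time, response.url)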