def get_detail(self):
    urls = self.get_list()
    for title in urls:
        try:
            print "Crawling =>", title, urls[title]
            targetfile = os.path.join(
                'C:\\Users\\Administrator\\Desktop\\Projects\\Uniqueness\\files',
                title + '.csv')
            if os.path.exists(targetfile):
                print 'pass'  # already crawled, skip
                continue
            driver = PhantomJS_driver.initialization()
            driver.get(urls[title])
            bs_obj = bs4.BeautifulSoup(driver.page_source, 'html.parser')
            driver.quit()
            #bs_obj = bs4.BeautifulSoup(requests_manager.get_html(urls[title]), 'html.parser')
            if bs_obj.find('ul', class_='areaList'):
                bs_obj = self.area_parser(bs_obj)
            self.list_parser(bs_obj, title, targetfile)
        except Exception:
            log_obj.error(title)
            log_obj.error(urls[title])
            log_obj.error(traceback.format_exc())
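# A minimal sketch (not part of the original module) of factoring the
# fetch-render-parse step above into one helper, so the driver is quit even
# when driver.get() raises mid-crawl. render_page is a hypothetical name;
# PhantomJS_driver.initialization() is the same factory used throughout.
def render_page(url):
    driver = PhantomJS_driver.initialization()
    try:
        driver.get(url)
        return bs4.BeautifulSoup(driver.page_source, 'html.parser')
    finally:
        driver.quit()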
def parse_img(self, response):
    bs_obj = bs4.BeautifulSoup(response.text, 'html.parser')
    item = response.meta['item']
    try:
        print "\n====>Running parse_img \n"
        e_div = bs_obj.find('div', id='ImageDIV')
        img_url = e_div.img.get('src')
        # Strip the ?r=... cache-buster and make the URL absolute.
        img_url = 'http://www.hzplanning.gov.cn' + re.sub(r'\?r=.+', '', img_url)
        PhantomJS_driver.get_file(
            img_url,
            os.path.join(os.getcwd(), 'files',
                         '%s.jpg' % item['monitor_title']))
        yield item
    except Exception:
        log_obj.error(
            item['monitor_url'],
            "%s(%s) could not be parsed\n%s" %
            (self.name, response.url, traceback.format_exc()))
        yield response.meta['item']
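# Hedged alternative to hard-coding the host prefix above: resolve the <img>
# src against the page URL with urljoin, which also copes with already-absolute
# srcs. Assumes Python 2 (urlparse); absolute_img_url is a hypothetical helper.
import urlparse

def absolute_img_url(page_url, src):
    src = re.sub(r'\?r=.+', '', src)  # drop the ?r=... cache-buster
    return urlparse.urljoin(page_url, src)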
def parse(self, response):
    try:
        driver = PhantomJS_driver.initialization()
        driver.get(response.url)
        html_list = []
        for i in xrange(monitor_page):
            html_list.append(
                bs4.BeautifulSoup(driver.page_source, 'html.parser'))
            driver.find_element_by_link_text(u'[下一页]').click()
            time.sleep(1)
        driver.quit()
        for bs_obj in html_list:
            e_table = bs_obj.find('table', class_='ZjYhN018')
            e_row = e_table.find_all('tr', class_='ZjYhN018')
            for e_tr in e_row:
                item = announcements_monitor.items.AnnouncementsMonitorItem()
                item['monitor_city'] = '余杭'
                item['parcel_status'] = 'city_planning'
                # Skip header rows (no link).
                if not e_tr.a:
                    continue
                e_tds = e_tr.find_all('td')
                item['monitor_id'] = self.name
                # e.g. /scxx/tdsc/tdcrgg/2016-11-17/6409.html
                item['monitor_title'] = e_tr.a.find_next('a').get_text(strip=True)  # title
                item['monitor_date'] = e_tds[-1].get_text(strip=True)  # transaction date
                item['monitor_url'] = ('http://www.yuhang.gov.cn/xxgk/gggs/js'
                                       + e_tr.a.find_next('a').get('href')[1:])
                #print "url ======>", item['monitor_url']
                yield scrapy.Request(item['monitor_url'],
                                     meta={'item': item},
                                     callback=self.parse1,
                                     dont_filter=True)
    except Exception:
        log_obj.update_error("Failed to parse %s\nReason: %s" %
                             (self.name, traceback.format_exc()))
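# Sketch of the snapshot-and-click pagination pattern shared by the parse()
# methods in these spiders, under the assumption that the pager is a plain
# link. collect_pages is a hypothetical helper; link_text must match the
# site's label exactly (u'[下一页]' here, u'下页' on other sites).
def collect_pages(driver, link_text, pages):
    html_list = []
    for i in xrange(pages):
        html_list.append(bs4.BeautifulSoup(driver.page_source, 'html.parser'))
        if i < pages - 1:  # no need to click "next" after the last snapshot
            driver.find_element_by_link_text(link_text).click()
            time.sleep(1)  # crude wait for the next page to render
    return html_list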
def parse(self, response):
    try:
        driver = PhantomJS_driver.initialization()
        driver.get(response.url)
        html_list = []
        monitor_page = 1  # number of listing pages to monitor
        for i in xrange(monitor_page):
            html_list.append(
                bs4.BeautifulSoup(driver.page_source, 'html.parser'))
            driver.find_element_by_link_text(u'[下一页]').click()
            time.sleep(1)
        driver.quit()
        for bs_obj in html_list:
            e_table = bs_obj.find('table', class_='publicityCss')
            e_row = e_table.find_all('tr')[1:]  # drop the header row
            for e_tr in e_row:
                item = announcements_monitor.items.AnnouncementsMonitorItem()
                item['monitor_city'] = '杭州'
                item['parcel_status'] = 'city_planning'
                # Skip rows without a link.
                if not e_tr.a:
                    continue
                e_tds = e_tr.find_all('td')
                item['monitor_id'] = self.name
                item['monitor_title'] = e_tds[0].a.get_text(strip=True)  # title
                item['monitor_date'] = e_tds[2].get_text(strip=True)  # transaction date
                item['monitor_url'] = ('http://www.hzplanning.gov.cn'
                                       + e_tds[0].a.get('href'))
                yield scrapy.Request(item['monitor_url'],
                                     meta={'item': item},
                                     callback=self.parse1,
                                     dont_filter=True)
    except Exception:
        log_obj.update_error("Failed to parse %s\nReason: %s" %
                             (self.name, traceback.format_exc()))
def parse(self, response):
    try:
        driver = PhantomJS_driver.initialization()
        driver.get(response.url)
        html_list = []
        for i in xrange(monitor_page):
            html_list.append(
                bs4.BeautifulSoup(driver.page_source, 'html.parser'))
            driver.find_element_by_link_text(u'下页').click()
            time.sleep(1)
        driver.quit()
        for bs_obj in html_list:
            e_table = bs_obj.find('table', class_='xxgk-table1 table table-border')
            e_row = e_table.find_all('tr', class_='xxgk-table1-tr')
            for e_tr in e_row:
                item = announcements_monitor.items.AnnouncementsMonitorItem()
                item['monitor_city'] = '浙江'
                item['parcel_status'] = 'city_planning'
                e_tds = e_tr.find_all('td')
                item['monitor_id'] = self.name
                item['monitor_title'] = e_tr.a.get('title')  # title
                item['monitor_date'] = e_tds[-1].get_text(strip=True)  # transaction date
                # Strip the leading ../../../../ and resolve against the site root.
                url = re.sub(r'\.\./\.\./\.\./\.\./', '', e_tr.a.get('href'))
                item['monitor_url'] = 'http://www.zjjs.gov.cn/' + url
                yield scrapy.Request(item['monitor_url'],
                                     meta={'item': item},
                                     callback=self.parse1,
                                     dont_filter=True)
    except Exception:
        log_obj.update_error("Failed to parse %s\nReason: %s" %
                             (self.name, traceback.format_exc()))
def parse(self, response):
    driver = PhantomJS_driver.initialization()
    driver.get(response.url)
    html_list = []
    for i in xrange(monitor_page):
        html_list.append(
            bs4.BeautifulSoup(driver.page_source, 'html.parser'))
        driver.find_element_by_xpath(
            "//*[@class='default_pgBtn default_pgNext']").click()
        time.sleep(1)
    driver.quit()
    try:
        for bs_obj in html_list:
            e_container = bs_obj.find('div', class_='default_pgContainer')
            e_row = e_container.find_all('table')
            for e in e_row:
                item = announcements_monitor.items.AnnouncementsMonitorItem()
                item['monitor_city'] = '浙江'
                item['parcel_status'] = 'city_planning'
                e_a = e.a
                item['monitor_id'] = self.name
                item['monitor_title'] = e_a.get('title')  # title
                item['monitor_date'] = e_a.find_next('td').get_text(strip=True)  # transaction date
                item['monitor_url'] = 'http://www.zjdpc.gov.cn' + e_a.get('href')
                yield scrapy.Request(item['monitor_url'],
                                     meta={'item': item},
                                     callback=self.parse1,
                                     dont_filter=True)
    except Exception:
        log_obj.update_error("Failed to parse %s\nReason: %s" %
                             (self.name, traceback.format_exc()))
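# Hedged variant of the click-then-sleep paging above: wait until the pager
# button is clickable instead of sleeping a fixed second. Uses selenium's
# standard support.ui wait API; click_next is a hypothetical helper.
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def click_next(driver, timeout=10):
    WebDriverWait(driver, timeout).until(
        EC.element_to_be_clickable((By.CLASS_NAME, 'default_pgNext'))).click()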
def parse(self, response):
    try:
        bs_obj = bs4.BeautifulSoup(PhantomJS_driver.get_html(response.url),
                                   'html.parser')
        # Note: XPath expressions copied from Chrome's inspector can look
        # correct in the browser yet return the wrong result here, so this
        # parser sticks to BeautifulSoup lookups.
        e_table = bs_obj.find('div', class_='default_pgContainer')
        e_row = e_table.find_all('li')
        for e_tr in e_row:
            item = announcements_monitor.items.AnnouncementsMonitorItem()
            item['monitor_city'] = '台州'
            item['monitor_id'] = self.name
            # e.g. /scxx/tdsc/tdcrgg/2016-11-17/6409.html
            item['monitor_title'] = e_tr.a.get_text(strip=True)  # title
            item['monitor_date'] = e_tr.span.get_text(strip=True)  # transaction date
            item['monitor_url'] = "http://www.zjtzgtj.gov.cn" + e_tr.a.get('href')
            if re.search(ur'国有建设用地使用权挂牌出让公告', item['monitor_title']):
                item['parcel_status'] = 'onsell'
                yield scrapy.Request(item['monitor_url'], meta={'item': item},
                                     callback=self.parse1, dont_filter=True)
            elif re.search(ur'国有建设用地使用权出让结果公布', item['monitor_title']):
                item['parcel_status'] = 'sold'
                yield scrapy.Request(item['monitor_url'], meta={'item': item},
                                     callback=self.parse1, dont_filter=True)
            else:
                continue  # other announcement types are not monitored
    except Exception:
        log_obj.update_error("Failed to parse %s\nReason: %s" %
                             (self.name, traceback.format_exc()))
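# The title -> parcel_status mapping used above, kept as a table so a new
# announcement type only needs one more entry. Keywords are the ones matched
# in parse(); classify_title is a hypothetical helper.
STATUS_BY_KEYWORD = [
    (ur'国有建设用地使用权挂牌出让公告', 'onsell'),
    (ur'国有建设用地使用权出让结果公布', 'sold'),
]

def classify_title(title):
    for keyword, status in STATUS_BY_KEYWORD:
        if re.search(keyword, title):
            return status
    return None  # unknown announcement type: the caller skips the item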
import os
import re
import sys
import time
import datetime
import traceback

import bs4
import scrapy

log_path = r'%s/log/spider_DEBUG(%s).log' % (
    os.getcwd(), datetime.datetime.date(datetime.datetime.today()))
sys.path.append(sys.prefix + "\\Lib\\MyWheels")
sys.path.append(os.getcwd())
#########
reload(sys)
sys.setdefaultencoding('utf8')
import spider_log
########
import spider_func
import PhantomJS_driver
PhantomJS_driver = PhantomJS_driver.PhantomJS_driver()
spider_func = spider_func.spider_func()
log_obj = spider_log.spider_log()
#########
with open(os.getcwd() + r'\announcements_monitor\spiders\needed_data.txt', 'r') as f:
    s = f.read()
needed_data = s.split(',')
needed_data = [s.encode('utf8') for s in needed_data]

monitor_page = 1  # number of listing pages to monitor


class Spider(scrapy.Spider):
    name = "500003"

    def start_requests(self):
        self.urls = ["http://www.fuyang.gov.cn/fy/ghj/jghs/index_%s.jhtml" % (i + 1)
                     for i in xrange(monitor_page)]
        for url in self.urls:
            yield scrapy.Request(url=url, callback=self.parse)
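# Hedged alternative for loading needed_data without relying on
# reload(sys)/sys.setdefaultencoding: read the file as UTF-8 explicitly via
# Python 2's io module. load_needed_data is a hypothetical helper; the
# comma-separated file format is the one used above.
import io

def load_needed_data(path):
    with io.open(path, 'r', encoding='utf8') as f:
        return [s.strip() for s in f.read().split(',') if s.strip()]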
def parse(self, response):
    # First, count the total number of listing pages.
    driver = PhantomJS_driver.initialization()
    driver.set_page_load_timeout(360)
    driver.get('about:blank')
    driver.get(response.url)
    driver.switch_to.frame('contentmain')
    driver.find_element_by_class_name('a1').click()
    e_div = driver.find_element_by_class_name('content_fy')
    e_div.find_element_by_link_text(u'末页').click()
    total_page = int(e_div.find_element_by_tag_name('strong').text)
    print "%s pages in total" % total_page
    page_dict = {(i + 1): None for i in range(total_page)}  # rows per page, filled lazily
    print page_dict
    current_row = [1, 1]
    driver.quit()
    detail_page_row = 1
    detail_page_row_count = 1
    while True:
        page, row = current_row
        try:
            driver = PhantomJS_driver.initialization()
            driver.set_page_load_timeout(360)
            driver.get('about:blank')
            driver.get(response.url)
            driver.switch_to.frame('contentmain')
            driver.find_element_by_class_name('a1').click()
            # Page forward to the target listing page.
            read_page = 1
            for i in range(page - 1):
                e_div = driver.find_element_by_class_name('content_fy')
                e_div.find_element_by_link_text(u'下一页').click()
                read_page = e_div.find_element_by_tag_name('strong').text
            print 'Current page: %s' % read_page
            # Parse the rows on this listing page.
            driver.switch_to.frame('noticelist_main')
            e_trs = driver.find_elements_by_tag_name('tr')[1:]
            # Record how many rows this page has.
            if page_dict[page] is None:
                page_dict[page] = len(e_trs)
            print "Page %s has %s rows" % (page, page_dict[page])
            print "Parsing row %s..." % row
            e_row = e_trs[row - 1]
            title0 = e_row.find_elements_by_tag_name('td')[0].text
            detail_button = e_row.find_element_by_tag_name('a')
            detail_button.click()
            driver.switch_to_window(driver.window_handles[-1])
            # The detail page itself lists several rows to click through.
            #driver.get_screenshot_as_file('C:\\Users\\Administrator\\Desktop\\data.png')
            driver.switch_to.frame('contentmain')
            driver.switch_to.frame('resslist_main')
            e_table = driver.find_element_by_tag_name('table')
            e_trs = e_table.find_elements_by_tag_name('tr')[1:]
            detail_page_row_count = len(e_trs)
            e_tr = e_trs[detail_page_row - 1]
            title = e_tr.find_elements_by_tag_name('td')[1].text
            detail_button = e_tr.find_element_by_tag_name('a')
            detail_button.click()
            driver.switch_to_window(driver.window_handles[-1])
            driver.switch_to.frame('contentmain')
            # Tables on the detail page.
            #table_html = bs4.BeautifulSoup(driver.page_source, 'html.parser')
            #df = self.parse_table(driver.page_source)
            l = pd.read_html(driver.page_source)
            print len(l)
            df1, df2 = l[-2:]
            arr = np.array(df2).reshape(-1, 2)
            ser = pd.Series(arr[:, 1], index=arr[:, 0])
            ser.loc['挂牌起始价(RMB)'] = df1.loc[1, 1]
            ser['竞买保证金(RMB)'] = df1.loc[1, 3]
            #driver.switch_to.frame('time')
            #time0 = driver.find_element_by_tag_name('div').text
            #ser['距离挂牌开始时间'] = time0
            ser[pd.isnull(ser) == False].to_excel(
                'C:\\Users\\Administrator\\Desktop\\files\\%s.xlsx' % title0)
            # Download the attached files.
            #driver.switch_to.frame('contentmain')
            e_div = driver.find_element_by_class_name('tab_list')
            e_a = e_div.find_elements_by_tag_name('a')[5]
            e_a.click()
            driver.switch_to_window(driver.window_handles[-1])
            #driver.get_screenshot_as_file('C:\\Users\\Administrator\\Desktop\\data.png')
            #with open('C:\\Users\\Administrator\\Desktop\\data.html', 'w') as f:
            #    f.write(driver.page_source)
            bs_obj = bs4.BeautifulSoup(driver.page_source, 'html.parser')
            e_div = bs_obj.find('div', class_='xs_list_table')
            title1_list = [
                e.get_text(strip=True) for e in e_div.find_all('tr')[1:]
            ]
            html_text = e_div.prettify(encoding='utf8')
            # The file links are hidden inside HTML comments, so pull them out
            # with a regex.
            comment_list = re.findall(r'<!--.+?-->', html_text, re.S)
            #print comment_list
            for i in range(len(comment_list)):
                print "Fetching %s (%s files in total)" % (title1_list[i],
                                                           len(comment_list))
                s = comment_list[i]
                m = re.search(r'(?<=<a href=" ).+?(?=")', s)
                file_url = 'http://tdjy.zjdlr.gov.cn/GTJY_ZJ/' + m.group()
                #file_name = re.search(r'(?<=fileName=).+', file_url).group()
                PhantomJS_driver.get_file(
                    file_url,
                    'C:\\Users\\Administrator\\Desktop\\files\\(%s)%s' %
                    (title, title1_list[i]))
            #yield scrapy.Request(item['monitor_url'], meta={'item': item}, callback=self.parse1, dont_filter=True)
        except Exception:
            log_obj.update_error("Failed to parse %s\nReason: %s|%s|%s\n%s" %
                                 (self.name, page, row, detail_page_row,
                                  traceback.format_exc()))
        # Advance the cursor: detail rows first, then listing rows, then pages.
        if detail_page_row < detail_page_row_count:
            detail_page_row = detail_page_row + 1
        else:
            detail_page_row = 1
            if row == page_dict[page]:
                row = 1
                page = page + 1
            else:
                row = row + 1
        if page > total_page:
            break
        current_row = page, row
        driver.quit()
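# What the pd.read_html post-processing above does, in isolation: the last
# table arrives as rows of alternating label/value cells, so flattening it to
# shape (-1, 2) yields label-value pairs for a Series. The toy rows below are
# illustrative only, not real site data.
import numpy as np
import pandas as pd

df2 = pd.DataFrame([['地块编号', 'A-01', '用地面积', '1200'],
                    ['出让年限', '40', '容积率', '2.5']])
arr = np.array(df2).reshape(-1, 2)           # -> [[label, value], ...]
ser = pd.Series(arr[:, 1], index=arr[:, 0])  # index = labels, values = text
print ser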