def get_detail(self):
        urls = self.get_list()

        for title in urls:
            try:
                print "Crawling =>", title, urls[title]
                targetfile = os.path.join(
                    'C:\\Users\\Administrator\\Desktop\\Projects\\Uniqueness\\files',
                    title + '.csv')
                if os.path.exists(targetfile):
                    print 'pass'
                    continue
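                # Render the listing page with PhantomJS and hand the HTML to BeautifulSoup.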
                driver = PhantomJS_driver.initialization()
                driver.get(urls[title])
                bs_obj = bs4.BeautifulSoup(driver.page_source, 'html.parser')
                driver.quit()
                #bs_obj = bs4.BeautifulSoup(requests_manager.get_html(urls[title]), 'html.parser')
                if bs_obj.find('ul', class_='areaList'):
                    bs_obj = self.area_parser(bs_obj)

                self.list_parser(bs_obj, title, targetfile)
            except:
                log_obj.error(title)
                log_obj.error(urls[title])
                log_obj.error(traceback.format_exc())
    def parse_img(self, response):
        bs_obj = bs4.BeautifulSoup(response.text, 'html.parser')
        item = response.meta['item']

        try:
            print "\n====>Running parse_img \n"
            e_div = bs_obj.find('div', id='ImageDIV')
            img_url = e_div.img.get('src')
            img_url = 'http://www.hzplanning.gov.cn' + re.sub(
                r'\?r\=.+', '', img_url)
            PhantomJS_driver.get_file(
                img_url,
                os.path.join(os.getcwd(), 'files',
                             '%s.jpg' % item['monitor_title']))
            yield item
        except:
            log_obj.error(
                "Failed to parse %s in %s(%s)\n%s" %
                (item['monitor_url'], self.name, response.url,
                 traceback.format_exc()))
            yield response.meta['item']
Example #3
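    # Yuhang announcements spider: paginate the listing with PhantomJS, parse each
    # captured page, and queue every announcement URL for parse1.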
    def parse(self, response):
        try:
            driver = PhantomJS_driver.initialization()
            driver.get(response.url)
            html_list = []

            for i in xrange(monitor_page):
                html_list.append(
                    bs4.BeautifulSoup(driver.page_source, 'html.parser'))
                driver.find_element_by_link_text('[下一页]').click()
                time.sleep(1)

            driver.quit()

            for bs_obj in html_list:
                e_table = bs_obj.find('table', class_='ZjYhN018')
                e_row = e_table.find_all('tr', class_='ZjYhN018')
                for e_tr in e_row:
                    item = announcements_monitor.items.AnnouncementsMonitorItem()
                    item['monitor_city'] = '余杭'
                    item['parcel_status'] = 'city_planning'

                    # skip header/title rows that carry no link
                    if not e_tr.a:
                        continue

                    e_tds = e_tr.find_all('td')
                    item['monitor_id'] = self.name  # e.g. /scxx/tdsc/tdcrgg/2016-11-17/6409.html
                    item['monitor_title'] = e_tr.a.find_next('a').get_text(
                        strip=True)  # title
                    item['monitor_date'] = e_tds[-1].get_text(
                        strip=True)  # transaction date
                    item['monitor_url'] = ('http://www.yuhang.gov.cn/xxgk/gggs/js' +
                                           e_tr.a.find_next('a').get('href')[1:])
                    #print "url ======>", item['monitor_url']

                    yield scrapy.Request(item['monitor_url'],
                                         meta={'item': item},
                                         callback=self.parse1,
                                         dont_filter=True)
        except:
            log_obj.update_error("Failed to parse in %s\nReason: %s" %
                                 (self.name, traceback.format_exc()))
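
    # Hangzhou planning bureau listing: same pattern, reading rows from table
    # 'publicityCss' and queuing each announcement for parse1.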
    def parse(self, response):
        try:
            driver = PhantomJS_driver.initialization()
            driver.get(response.url)
            html_list = []
            monitor_page = 1  # number of listing pages to monitor

            for i in xrange(monitor_page):
                html_list.append(
                    bs4.BeautifulSoup(driver.page_source, 'html.parser'))
                driver.find_element_by_link_text(u'[下一页]').click()

            driver.quit()

            for bs_obj in html_list:
                e_table = bs_obj.find('table', class_='publicityCss')
                e_row = e_table.find_all('tr')[1:]
                for e_tr in e_row:
                    item = announcements_monitor.items.AnnouncementsMonitorItem()
                    item['monitor_city'] = '杭州'
                    item['parcel_status'] = 'city_planning'

                    # skip header/title rows that carry no link
                    if not e_tr.a:
                        continue

                    e_tds = e_tr.find_all('td')
                    item['monitor_id'] = self.name
                    item['monitor_title'] = e_tds[0].a.get_text(
                        strip=True)  # title
                    item['monitor_date'] = e_tds[2].get_text(
                        strip=True)  # transaction date
                    item['monitor_url'] = ('http://www.hzplanning.gov.cn' +
                                           e_tds[0].a.get('href'))

                    yield scrapy.Request(item['monitor_url'],
                                         meta={'item': item},
                                         callback=self.parse1,
                                         dont_filter=True)
        except:
            log_obj.update_error("Failed to parse in %s\nReason: %s" %
                                 (self.name, traceback.format_exc()))
Example #5
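    # zjjs.gov.cn listing: rows live in table 'xxgk-table1'; relative '../' prefixes are
    # stripped from each href before it is rebased onto the site root.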
    def parse(self, response):
        try:
            driver = PhantomJS_driver.initialization()
            driver.get(response.url)
            html_list = []

            for i in xrange(monitor_page):
                html_list.append(
                    bs4.BeautifulSoup(driver.page_source, 'html.parser'))
                driver.find_element_by_link_text(u'下页').click()
                time.sleep(1)

            driver.quit()

            for bs_obj in html_list:
                e_table = bs_obj.find('table',
                                      class_='xxgk-table1 table table-border')
                e_row = e_table.find_all('tr', class_='xxgk-table1-tr')
                for e_tr in e_row:
                    item = announcements_monitor.items.AnnouncementsMonitorItem()
                    item['monitor_city'] = '浙江'
                    item['parcel_status'] = 'city_planning'

                    e_tds = e_tr.find_all('td')
                    item['monitor_id'] = self.name
                    item['monitor_title'] = e_tr.a.get('title')  # title
                    item['monitor_date'] = e_tds[-1].get_text(
                        strip=True)  # transaction date
                    url = re.sub(r'\.\.\/\.\.\/\.\.\/\.\.\/', '',
                                 e_tr.a.get('href'))
                    item['monitor_url'] = 'http://www.zjjs.gov.cn/' + url

                    yield scrapy.Request(item['monitor_url'],
                                         meta={'item': item},
                                         callback=self.parse1,
                                         dont_filter=True)
        except:
            log_obj.update_error("Failed to parse in %s\nReason: %s" %
                                 (self.name, traceback.format_exc()))
Example #6
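    # zjdpc.gov.cn listing: each announcement is a small <table> inside div
    # 'default_pgContainer'; pagination uses the 'default_pgNext' button.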
    def parse(self, response):

        driver = PhantomJS_driver.initialization()
        driver.get(response.url)
        html_list = []

        for i in xrange(monitor_page):
            html_list.append(
                bs4.BeautifulSoup(driver.page_source, 'html.parser'))
            driver.find_element_by_xpath(
                "//*[@class='default_pgBtn default_pgNext']").click()
            time.sleep(1)

        driver.quit()

        try:
            for bs_obj in html_list:
                e_table = bs_obj.find('div', class_='default_pgContainer')
                e_row = e_table.find_all('table')
                for e in e_row:
                    item = announcements_monitor.items.AnnouncementsMonitorItem()
                    item['monitor_city'] = '浙江'
                    item['parcel_status'] = 'city_planning'

                    e_a = e.a
                    item['monitor_id'] = self.name
                    item['monitor_title'] = e_a.get('title')  # title
                    item['monitor_date'] = e_a.find_next('td').get_text(
                        strip=True)  # transaction date
                    item['monitor_url'] = 'http://www.zjdpc.gov.cn' + e_a.get(
                        'href')

                    yield scrapy.Request(item['monitor_url'],
                                         meta={'item': item},
                                         callback=self.parse1,
                                         dont_filter=True)
        except:
            log_obj.update_error("Failed to parse in %s\nReason: %s" %
                                 (self.name, traceback.format_exc()))
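
    # Taizhou land-bureau listing: items are <li> entries; the title text decides whether a
    # parcel is marked 'onsell' or 'sold' before the detail request is queued.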
    def parse(self, response):
        try:
            bs_obj = bs4.BeautifulSoup(PhantomJS_driver.get_html(response.url), 'html.parser')
            """在使用chrome等浏览器自带的提取extract xpath路径的时候,
                导致明明在浏览器中提取正确, 却在程序中返回错误的结果"""

            e_table = bs_obj.find('div', class_='default_pgContainer')
            e_row = e_table.find_all('li')
            for e_tr in e_row:
                item = announcements_monitor.items.AnnouncementsMonitorItem()
                item['monitor_city'] = '台州'

                item['monitor_id'] = self.name  # e.g. /scxx/tdsc/tdcrgg/2016-11-17/6409.html
                item['monitor_title'] = e_tr.a.get_text(strip=True)  # title
                item['monitor_date'] = e_tr.span.get_text(strip=True)  # transaction date (was: site.xpath('td[3]/text()').extract_first())
                item['monitor_url'] = "http://www.zjtzgtj.gov.cn" + e_tr.a.get('href')

                if re.search(ur'国有建设用地使用权挂牌出让公告', item['monitor_title']):
                    item['parcel_status'] = 'onsell'
                    yield scrapy.Request(item['monitor_url'],meta={'item':item},callback=self.parse1, dont_filter=True)
                elif re.search(ur'国有建设用地使用权出让结果公布', item['monitor_title']):
                    item['parcel_status'] = 'sold'
                    yield scrapy.Request(item['monitor_url'],meta={'item':item},callback=self.parse1, dont_filter=True)
                else:
                    # neither announcement pattern matched; skip this row
                    continue

import os
import sys
import re
import time
import traceback
import datetime

import bs4
import scrapy
import numpy as np
import pandas as pd

log_path = r'%s/log/spider_DEBUG(%s).log' % (os.getcwd(), datetime.datetime.date(datetime.datetime.today()))

sys.path.append(sys.prefix + "\\Lib\\MyWheels")
sys.path.append(os.getcwd())
reload(sys)
sys.setdefaultencoding('utf8')
import spider_log
import spider_func
import PhantomJS_driver
PhantomJS_driver = PhantomJS_driver.PhantomJS_driver()
spider_func = spider_func.spider_func()
log_obj = spider_log.spider_log()

with open(os.getcwd() + r'\announcements_monitor\spiders\needed_data.txt', 'r') as f:
    s = f.read()
    needed_data = s.split(',')
needed_data = [s.encode('utf8') for s in needed_data]

monitor_page = 1  # number of listing pages to monitor

class Spider(scrapy.Spider):
    name = "500003"

    def start_requests(self):
        self.urls = ["http://www.fuyang.gov.cn/fy/ghj/jghs/index_%s.jhtml" % (i + 1) for i in xrange(monitor_page)]
        # dispatch one request per listing page
        for url in self.urls:
            yield scrapy.Request(url, callback=self.parse, dont_filter=True)

    def parse(self, response):

        # work out how many listing pages there are in total
        driver = PhantomJS_driver.initialization()
        driver.set_page_load_timeout(360)
        driver.get('about:blank')
        driver.get(response.url)
        driver.switch_to.frame('contentmain')
        driver.find_element_by_class_name('a1').click()

        e_div = driver.find_element_by_class_name('content_fy')
        e_div.find_element_by_link_text('末页').click()
        total_page = int(e_div.find_element_by_tag_name('strong').text)

        print "一共有%s页" % total_page
        page_dict = {(i + 1): None for i in range(total_page)}
        print page_dict
        current_row = [1, 1]
        driver.quit()

        detail_page_row = 1
        detail_page_row_count = 1
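        # Walk the site with three cursors (listing page, row, detail-page row), restarting a
        # fresh PhantomJS session on every iteration so one failure cannot stall the crawl.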
        while True:
            page, row = current_row
            try:
                driver = PhantomJS_driver.initialization()
                driver.set_page_load_timeout(360)
                driver.get('about:blank')
                driver.get(response.url)
                driver.switch_to.frame('contentmain')
                driver.find_element_by_class_name('a1').click()

                # page forward to the target listing page
                read_page = 1
                for i in range(page - 1):
                    e_div = driver.find_element_by_class_name('content_fy')
                    e_div.find_element_by_link_text('下一页').click()
                    read_page = e_div.find_element_by_tag_name('strong').text
                print 'Current listing page: %s' % read_page

                # start parsing the row data
                driver.switch_to.frame('noticelist_main')
                e_trs = driver.find_elements_by_tag_name('tr')[1:]

                # record how many rows this listing page has
                if page_dict[page] is None:
                    page_dict[page] = len(e_trs)
                    print "Page %s has %s rows" % (page, page_dict[page])

                print "Parsing row %s..." % row
                e_row = e_trs[row - 1]
                title0 = e_row.find_elements_by_tag_name('td')[0].text

                detail_button = e_row.find_element_by_tag_name('a')
                detail_button.click()
                driver.switch_to_window(driver.window_handles[-1])

                # the detail page itself has several rows that still need to be clicked through
                #driver.get_screenshot_as_file('C:\Users\Administrator\Desktop\data.png')

                driver.switch_to.frame('contentmain')
                driver.switch_to.frame('resslist_main')

                e_table = driver.find_element_by_tag_name('table')
                e_trs = e_table.find_elements_by_tag_name('tr')[1:]
                detail_page_row_count = len(e_trs)

                e_tr = e_trs[detail_page_row - 1]

                title = e_tr.find_elements_by_tag_name('td')[1].text

                detail_button = e_tr.find_element_by_tag_name('a')
                detail_button.click()
                driver.switch_to_window(driver.window_handles[-1])

                driver.switch_to.frame('contentmain')

                # tables on the detail page
                #table_html = bs4.BeautifulSoup(driver.page_source,'html.parser')
                #df = self.parse_table(driver.page_source)
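                # pandas extracts every <table> on the detail page; the last table is reshaped
                # into key/value pairs, the listing price and bid deposit are copied in from
                # the table before it, and the result is written out to Excel.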
                l = pd.read_html(driver.page_source)
                print len(l)
                df1, df2 = l[-2:]
                arr = np.array(df2).reshape(-1, 2)
                ser = pd.Series(arr[:, 1], index=arr[:, 0])
                ser.loc['挂牌起始价(RMB)'] = df1.loc[1, 1]
                ser['竞买保证金(RMB)'] = df1.loc[1, 3]
                #driver.switch_to.frame('time')
                #time0 = driver.find_element_by_tag_name('div').text
                #ser['距离挂牌开始时间'] = time0
                ser[pd.isnull(ser) == False].to_excel(
                    'C:\\Users\\Administrator\\Desktop\\files\\%s.xlsx' %
                    title0)

                # download the attachment files
                #driver.switch_to.frame('contentmain')
                e_div = driver.find_element_by_class_name('tab_list')
                e_a = e_div.find_elements_by_tag_name('a')[5]
                e_a.click()
                driver.switch_to_window(driver.window_handles[-1])

                #driver.get_screenshot_as_file('C:\Users\Administrator\Desktop\data.png')
                #with open('C:\Users\Administrator\Desktop\data.html', 'w') as f:
                #    f.write(driver.page_source)

                bs_obj = bs4.BeautifulSoup(driver.page_source, 'html.parser')
                e_div = bs_obj.find('div', class_='xs_list_table')

                title1_list = [
                    e.get_text(strip=True) for e in e_div.find_all('tr')[1:]
                ]

                html_text = e_div.prettify(encoding='utf8')
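
                # The attachment links are hidden inside HTML comments, so the comment blocks
                # are pulled out with a regex and each file URL is recovered from them below.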

                comment_list = re.findall(r'\<\!--.+?--\>', html_text, re.S)
                #print comment_list
                for i in range(len(comment_list)):
                    print "Processing %s, %s files in total" % (title1_list[i],
                                                                len(comment_list))
                    s = comment_list[i]
                    m = re.search(r"(?<=\<a href\=\" ).+?(?=\")", s)
                    file_url = 'http://tdjy.zjdlr.gov.cn/GTJY_ZJ/' + m.group()
                    #file_name = re.search(r'(?<=fileName\=).+', file_url).group()
                    PhantomJS_driver.get_file(
                        file_url,
                        'C:\\Users\\Administrator\\Desktop\\files\\(%s)%s' %
                        (title, title1_list[i]))

                #yield scrapy.Request(item['monitor_url'], meta={'item': item}, callback=self.parse1, dont_filter=True)
            except:
                log_obj.update_error("Failed to parse in %s\nReason: %s|%s|%s\n%s" %
                                     (self.name, page, row, detail_page_row,
                                      traceback.format_exc()))

            if detail_page_row < detail_page_row_count:
                detail_page_row = detail_page_row + 1
            else:
                detail_page_row = 1
                if row == page_dict[page]:
                    row = 1
                    page = page + 1
                else:
                    row = row + 1

                if page > total_page:
                    break

            current_row = page, row

            driver.quit()