Example #1
class Spider(scrapy.Spider):
    name = "meeting"
    allowed_domains = ["tncc.gov.tw"]
    start_urls = ["http://www.tncc.gov.tw",]
    download_delay = 0.5
    county_abbr = os.path.dirname(os.path.realpath(__file__)).split('/')[-1]
    election_year = common.election_year(county_abbr)
    output_path = common.meeting_minutes_output_path(county_abbr, election_year)
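    # Map election years to council terms (屆); "ad" filters list rows for the current term.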
    ads = {'2010': 1, '2014': 2, '2018': 3}
    ad = ads[election_year]

    def parse(self, response):
        return response.follow(response.xpath(u'//a[re:test(., "^出版品$")]/@href').extract_first(), callback=self.parse_list)

    def parse_list(self, response):
        for tr in response.css('#table2 tr'):
            link = tr.xpath(u'descendant::a[re:test(., "^第%d屆")]/@href' % self.ad).extract_first()
            if link:
                item = {}
                item['election_year'] = self.election_year
                item['date'] = common.ROC2AD(tr.xpath('td[1]/text()').extract_first())
                item['meeting'] = tr.xpath('td[3]/descendant::a/text()').extract_first()
                item['meeting'] = item['meeting'].replace('.', u'、')
                item['download_url'] = urljoin(response.url, link)
                ext = item['download_url'].split('.')[-1]
                file_name = '%s.%s' % (item['meeting'], ext)
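                # Shell out to wget to fetch the minutes file into output_path.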
                cmd = 'mkdir -p %s && wget --no-check-certificate -c -O "%s%s" "%s"' % (self.output_path, self.output_path, file_name, item['download_url'])
                retcode = subprocess.call(cmd, shell=True)
                yield item
Example #2
class Spider(scrapy.Spider):
    name = "bills"
    allowed_domains = ["tccc.gov.tw", ]
    start_urls = ["http://proposal.tccc.gov.tw/test/"]
    download_delay = 0.5
    county_abbr = os.path.dirname(os.path.realpath(__file__)).split('/')[-1]
    election_year = common.election_year(county_abbr)
    ads = {'2010': 1, '2014': 2, '2018': 3}
    ad = ads[election_year]

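    # The page displays guest credentials (帳號/密碼) in plain text; scrape and submit them to log in.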
    def parse(self, response):
        payload = dict(zip(['account', 'pw'], response.xpath('//text()').re(u'帳號:(\S*)  密碼:(\S*)')))
        return scrapy.FormRequest.from_response(response, formname='form1', formdata=payload, callback=self.parse_logined)

    def parse_logined(self, response):
        for link in response.xpath(u'//a[re:test(., "提案查詢$")]/@href'):
            yield response.follow(link, callback=self.parse_query, meta={'dont_redirect': True})

    def parse_query(self, response):
        for value in response.xpath(u'//select[@name="SPeriod"]/optgroup[re:test(., "%s屆")]/option/@value' % self.ad).extract():
            payload = {'SPeriod': value}
            yield scrapy.FormRequest.from_response(response, formname='form1', formdata=payload, callback=self.parse_list)

    def parse_list(self, response):
        for node in response.xpath('//*[count(td)=11][position()>1]'):
            item = {}
            item['election_year'] = self.election_year
            item['id'] = node.xpath('td[1]/input/@value').extract_first().zfill(6)
            item['category'] = node.xpath('td[3]/text()').extract_first().split('-')[-1].strip()
            item['type'] = re.sub('\s', '', node.xpath('td[4]/text()').extract_first())
            item['proposed_by'] = node.xpath('td[5]/text()').extract_first().strip().split(u'、')
            item['petitioned_by'] = node.xpath('td[6]/text()').extract_first().strip().split(u'、') if node.xpath('td[6]/text()').extract_first() else []
            item['bill_no'] = re.sub('\s', '', node.xpath('td[7]/text()').extract_first())
            item['abstract'] = node.xpath('td[8]/descendant-or-self::*/text()').extract_first()
            item['execution'] = re.sub('\s', '', node.xpath('td[10]/text()').extract_first())
            link = urljoin(response.url, 'html_e_print.php?id=%s' % item['id'])
            yield response.follow(link, callback=self.parse_profile, meta={'item': item})
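        # Paginate via the 下一頁 (next page) link until it disappears.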
        next_page = response.xpath(u'//a[re:test(., "下一頁")]/@href').extract_first()
        if next_page:
            yield response.follow(next_page, callback=self.parse_list)

    def parse_profile(self, response):
        item = response.meta['item']
        item['description'] = response.xpath(u'//td[re:test(., "理[\s ]*由")]/following-sibling::td/descendant-or-self::*/text()').extract_first()
        item['methods'] = response.xpath(u'//td[re:test(., "辦[\s ]*法")]/following-sibling::td/descendant-or-self::*/text()').extract_first()
        item['motions'] = []
        for motion in [u'審查意見', u'大會議決']:
            resolution = response.xpath(u'//td[re:test(., "%s")]/following-sibling::td/descendant-or-self::*/text()' % u'[\s ]*'.join(motion)).extract_first()
            if resolution:
                item['motions'].append(dict(zip(['motion', 'resolution', 'date'], [motion, resolution, None])))
        item['links'] = [
            {
                'url': response.url,
                'note': 'original'
            },
        ]
        yield item
Example #3
class Spider(scrapy.Spider):
    name = "bills"
    allowed_domains = ["ntp.gov.tw"]
    start_urls = ['http://www.ntp.gov.tw/index.aspx?FType=mb']
    download_delay = 0.5
    county_abbr = os.path.dirname(os.path.realpath(__file__)).split('/')[-1]
    election_year = common.election_year(county_abbr)
    ads = {'2010': 1, '2014': 2, '2018': 3}
    ad = ads[election_year]

    def parse(self, response):
        return response.follow(response.xpath(u'//img[@alt="議事資訊"]/parent::a/@href').extract_first(), callback=self.parse_tab)

    def parse_tab(self, response):
        return response.follow(response.xpath(u'//a[re:test(., "議案查詢")]/@href').extract_first(), callback=self.parse_frame)

    def parse_frame(self, response):
        return response.follow(response.xpath(u'//iframe[@title="議案查詢介面"]/@src').extract_first(), callback=self.parse_query)

    def parse_query(self, response):
        payload = {'dd_MJ': response.xpath(u'//select[@name="dd_MJ"]/option[re:test(., "%s")]/@value' % self.ad).extract_first()}
        return scrapy.FormRequest.from_response(response, formname='Form1', formdata=payload, callback=self.parse_list)

    def parse_list(self, response):
        for node in response.xpath('//table[@id="dg_List"]/tr[position()>1 and position()<last()]'):
            yield response.follow(node.xpath('td[1]/input/@onclick').re(u"open\('(.*?=\d+)")[0], callback=self.parse_profile)
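        # The pager's <span> marks the current page; only fan out __doPostBack requests from page 1.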
        if response.css('.MultiPageButtonFont span::text').re('1$'):
            payload = {name: None for name in response.xpath('//input[not(@type="hidden")]/@name').extract()}
            for page in response.css('.MultiPageButtonFont').xpath('descendant::span[1]/following-sibling::a'):
                payload['__EVENTTARGET'] = page.xpath('@href').re("doPostBack\('([^']*)'")[0]
                yield scrapy.FormRequest.from_response(response, formdata=payload, callback=self.parse_list, dont_filter=True, dont_click=True, headers=common.headers(self.county_abbr))

    def parse_profile(self, response):
        item = {}
        item['election_year'] = self.election_year
        item['id'] = re.search(u'BillNO=(\d+)', response.url).group(1).zfill(6)
        item['type'] = response.xpath('//span[@id="lab_BillType"]/text()').extract_first().strip() if response.xpath('//span[@id="lab_BillType"]/text()').extract() else ''
        item['category'] = response.xpath('//span[@id="lab_BillClass"]/text()').extract_first().strip() if response.xpath('//span[@id="lab_BillClass"]/text()').extract() else ''
        item['proposed_by'] = response.xpath('//span[@id="lab_Provider"]/text()').extract_first().strip().split(u',') if response.xpath('//span[@id="lab_Provider"]/text()').extract() else []
        item['petitioned_by'] = (response.xpath('//span[@id="lab_SupportMan"]/text()').extract_first() or '').strip().split(u',')
        item['abstract'] = '\n'.join([re.sub('\s', '', x) for x in response.xpath('//span[@id="lab_Reason"]/div//text()').extract()])
        item['description'] = '\n'.join([re.sub('\s', '', x) for x in response.xpath('//span[@id="lab_Description"]/div//text()').extract()])
        item['methods'] = '\n'.join([re.sub('\s', '', x) for x in response.xpath('//span[@id="lab_Method"]/div/text()').extract()])
        motions = []
        for motion, id in [(u'市府回覆', 'dg_Response__ctl2_lab_dgReplyDesc'), (u'一讀決議', 'lab_OneResult'), (u'審查意見', 'lab_ExamResult'), (u'大會決議', 'lab_Result'), (u'二讀決議', 'lab_TwoResult'), (u'三讀決議', 'lab_ThreeResult'), ]:
            content = '\n'.join([re.sub('\s', '', x) for x in response.xpath('//span[@id="%s"]//text()' % id).extract()])
            if content:
                motions.append(dict(zip(['motion', 'resolution', 'date'], [motion, content, None])))
        item['motions'] = motions
        item['links'] = [
            {
                'url': response.url,
                'note': 'original'
            }
        ]
        return item
Example #4
class Spider(scrapy.Spider):
    name = "meeting"
    allowed_domains = ["www.tycc.gov.tw"]
    start_urls = [
        "http://www.tycc.gov.tw/content/public/public_main.aspx?wtp=1&wnd=217",
    ]
    download_delay = 0.5
    county_abbr = os.path.dirname(os.path.realpath(__file__)).split('/')[-1]
    election_year = common.election_year(county_abbr)
    output_path = common.meeting_minutes_output_path(county_abbr,
                                                     election_year)
    election_years_ad = {'2014': '1', '2010': '17'}
    ad = election_years_ad[election_year]

    def parse(self, response):
        nodes = response.xpath(
            u'//tr/td[re:test(@title, "第%s屆")]/following-sibling::td/a[re:test(., "會$")]'
            % self.ad)
        for node in nodes:
            item = {}
            item['election_year'] = self.election_year
            item['download_url'] = urljoin(
                response.url,
                node.xpath('@href').extract_first().strip())
            item['sitting'] = u'第%s屆' % self.ad
            item['meeting'] = node.xpath('descendant::*/text()').re(
                u'屆(.+會)$')[0]
            item['meeting'] = item['meeting'].replace('.', u'、')
            ext = node.xpath('@href').extract_first().split('.')[-1]
            file_name = '%s_%s.%s' % (item['sitting'], item['meeting'], ext)
            cmd = 'mkdir -p %s && wget -c -O "%s%s" "%s"' % (
                self.output_path, self.output_path, file_name,
                item['download_url'])
            retcode = subprocess.call(cmd, shell=True)
            yield item
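        # Second pass: rows whose links end in 冊 (volume) or pdf, where the meeting name spans two cells.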
        nodes = response.xpath(
            u'//tr/td[re:test(@title, "第%s屆")]/following-sibling::td/a[re:test(., "(冊|pdf)$")]'
            % self.ad)
        for node in nodes:
            item = {}
            item['election_year'] = self.election_year
            item['download_url'] = urljoin(
                response.url,
                node.xpath('@href').extract_first().strip())
            item['sitting'] = u'第%s屆' % self.ad
            item['meeting'] = '%s%s' % (
                node.xpath('preceding::td[1]/text()').re(u'屆(.+會)')[0],
                node.xpath('descendant::*/text()').re(u'(.+冊)')[0])
            item['meeting'] = item['meeting'].replace('.', u'、')
            ext = node.xpath('@href').extract_first().split('.')[-1]
            file_name = '%s_%s.%s' % (item['sitting'], item['meeting'], ext)
            cmd = 'mkdir -p %s && wget -c -O "%s%s" "%s"' % (
                self.output_path, self.output_path, file_name,
                item['download_url'])
            retcode = subprocess.call(cmd, shell=True)
            yield item
Example #5
class Spider(scrapy.Spider):
    name = "bills"
    allowed_domains = ["taitungcc.gov.tw", ]
    start_urls = ["http://www.taitungcc.gov.tw"]
    download_delay = 0.5
    county_abbr = os.path.dirname(os.path.realpath(__file__)).split('/')[-1]
    election_year = common.election_year(county_abbr)
    ads = {'2009': 17, '2014': 18, '2018': 19}
    ad = ads[election_year]

    def parse(self, response):
        return response.follow(response.xpath(u'//a[re:test(., "^議案查詢$")]/@href').extract_first(), callback=self.parse_list)

    def parse_list(self, response):
        pages = response.css('#BodyContent_PageHelpWuc1_lbTotalInFo::text').extract_first()
        self.logger.info(pages)
        for node in response.css('table.list3 tbody tr'):
            node_ad = int(node.xpath('td[1]/text()').re(u'(\d+)\s*屆')[0])
            if node_ad < self.ad:
                break
            if node_ad > self.ad:
                continue
            yield response.follow(node.xpath('@onclick').re("href='(.*)'")[0], callback=self.parse_profile)
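        # ASP.NET pager: when the 下一頁 button is enabled, post its name/value back for the next page.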
        next_page = response.xpath(u'//input[re:test(@value, "下一頁")][not(@disabled)]')
        if next_page:
            payload = {next_page.xpath('@name').extract_first(): next_page.xpath('@value').extract_first()}
            yield scrapy.FormRequest.from_response(response, formdata=payload, callback=self.parse_list, dont_filter=True, dont_click=True, headers=common.headers(self.county_abbr))

    def parse_profile(self, response):
        item = {}
        item['election_year'] = self.election_year
        item['id'] = re.search('=([^&]*)', response.url).group(1).zfill(6)
        item['bill_id'] = response.xpath(u'(//td[re:test(., "^案[\s ]*號$")]/following-sibling::td)[1]/text()').extract_first()
        item['category'] = re.search(u'.*?類', response.xpath(u'(//td[re:test(., "^案[\s ]*號$")]/following-sibling::td)[1]/text()').extract_first()).group(0)
        for key, label in [('type', u'議案分類'), ('abstract', u'案由'), ('description', u'說明'), ('methods', u'辦法')]:
            content = response.xpath(u'(//td[re:test(., "^%s$")]/following-sibling::td)[1]/text()' % label).extract_first()
            if content:
                item[key] = content.strip()
        item['proposed_by'] = re.sub(u'(副?議長|議員)', '', response.xpath(u'(//td[re:test(., "^(動議|提案|請願)(單位|人)(姓名)?$")]/following-sibling::td)[1]/text()').extract_first()).strip().split(u'、')
        item['petitioned_by'] = re.sub(u'(副?議長|議員)', '', (response.xpath(u'(//td[re:test(., "^(連署|附議)人$")]/following-sibling::td)[1]/text()').extract_first() or '')).strip().split(u'、')
        item['motions'] = []
        for motion in [u'審查意見', u'大會決議']:
            resolution = response.xpath(u'(//td[re:test(., "^%s$")]/following-sibling::td)[1]/text()' % motion).extract_first()
            if resolution:
                item['motions'].append(dict(zip(['motion', 'resolution', 'date'], [motion, resolution.strip(), None])))
        item['links'] = [
            {
                'url': response.url,
                'note': 'original'
            }
        ]
        return item
Example #6
class Spider(scrapy.Spider):
    name = "meeting"
    allowed_domains = ["cissearch.kcc.gov.tw"]
    start_urls = [
        "http://cissearch.kcc.gov.tw/System/MeetingRecord/Default.aspx",
    ]
    download_delay = 0.5
    county_abbr = os.path.dirname(os.path.realpath(__file__)).split('/')[-1]
    election_year = common.election_year(county_abbr)
    output_path = common.meeting_minutes_output_path(county_abbr,
                                                     election_year)

    def parse(self, response):
        count = response.xpath(
            '//span[@id="ContentPlaceHolder1_DataPager1"]/text()').re(
                u'共\s*(\d+)\s*筆')[0]
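        # Set the DataPager page size to the total row count so a single postback returns every row.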
        payload = {
            'ctl00$ContentPlaceHolder1$DataPager1$ctl02$txtPageSize': count
        }
        yield scrapy.FormRequest.from_response(response,
                                               formdata=payload,
                                               callback=self.parse_profile,
                                               dont_filter=True)

    def parse_profile(self, response):
        trs = response.xpath('//table[@id="ContentPlaceHolder1_gvIndex"]/tr')
        for tr in trs:
            item = {}
            tds = tr.xpath('td')
            if tds:
                item['election_year'] = self.election_year
                item['date'] = common.ROC2AD(
                    tds[1].xpath('text()').extract_first())
                item['meeting'] = tds[2].xpath('text()').re(
                    u'(.+?)[紀記][錄錄]')[0]
                item['download_url'] = urljoin(
                    response.url,
                    tds[3].xpath('a/@href').extract_first().strip())
                ext = item['download_url'].split('.')[-1]
                file_name = '%s.%s' % (item['meeting'], ext)
                cmd = 'mkdir -p %s && wget -c -O "%s%s" "%s"' % (
                    self.output_path, self.output_path, file_name,
                    item['download_url'])
                retcode = subprocess.call(cmd, shell=True)
                time.sleep(1)
                yield item
Example #7
class Spider(scrapy.Spider):
    name = "bills"
    allowed_domains = ["ptcc.gov.tw", ]
    start_urls = ["http://www.ptcc.gov.tw"]
    download_delay = 0.5
    county_abbr = os.path.dirname(os.path.realpath(__file__)).split('/')[-1]
    election_year = common.election_year(county_abbr)
    ads = {'2005': 16, '2009': 17, '2014': 18, '2018': 19}
    ad = ads[election_year]

    def parse(self, response):
        return response.follow(response.xpath(u'//a[img[@id="topmenu03"]]/@href').extract_first(), callback=self.parse_tab)

    def parse_tab(self, response):
        return response.follow(response.xpath(u'//a[re:test(., "^議員介紹$")]/@href').extract_first(), callback=self.parse_query)

    def parse_query(self, response):
        for link in response.css('.list.borderleft a::attr(href)'):
            yield response.follow(link, callback=self.parse_profile)

    def parse_profile(self, response):
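        # Each councilor page embeds a 提案 (proposals) table; iterate its data rows.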
        for node in response.xpath(u'(//td[re:test(., "提[\s ]*案")]/following-sibling::td)[1]/descendant::tr[position()>1]'):
            item = {}
            item['election_year'] = self.election_year
            item['id'] = '%s-%s-%s' % (node.xpath('td[3]/b/text()').extract_first(), node.xpath('td[1]/text()').extract_first(), node.xpath('td[2]/text()').extract_first(), )
            item['abstract'] = node.xpath('td[3]/text()').extract_first()
            item['proposed_by'] = re.sub(u'(副?議長|議員)', '', response.xpath(u'//td[re:test(., "議員:")]')[-1].xpath('text()').extract_first().strip(u'議員:').strip()).strip().split(u'、')
            resolution = (node.xpath('td[4]/text()').extract_first() or '').strip()
            if resolution:
                item['motions'] = [dict(zip(['motion', 'resolution', 'date'], [u'決議', resolution, None]))]
            item['links'] = [
                {
                    'url': response.url,
                    'note': 'original'
                }
            ]
            yield item
Example #8
class Spider(scrapy.Spider):
    name = "meeting"
    allowed_domains = ["www.kmc.gov.tw", "ebook.21cms.tw"]
    start_urls = ["http://www.kmc.gov.tw/recorder",]
    download_delay = 0.5
    county_abbr = os.path.dirname(os.path.realpath(__file__)).split('/')[-1]
    election_year = common.election_year(county_abbr)
    output_path = common.meeting_minutes_output_path(county_abbr, election_year)
    ads = {
        '2014': u'第十八屆',
        '2009': u'第十七屆'
    }
    ad = ads[election_year]

    def parse(self, response):
        nodes = response.css('.panel-body').xpath(u'descendant::a[re:test(., "%s")]' % self.ad)
        for node in nodes:
            link = node.xpath('@href').extract_first()
            item = {}
            item['election_year'] = self.election_year
            item['sitting'] = node.xpath('text()').extract_first().replace(u'(點擊閱讀)', '').replace('>>', '')
            item['download_url'] = urljoin(response.url, link)
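            # /ebook/ links point straight at a PDF; other links lead to an article page that embeds it in an iframe.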
            if re.search('/ebook/', link):
                file_name = '%s.pdf' % (item['sitting'], )
                cmd = 'mkdir -p %s && wget -A pdf -nd -r --no-parent -O "%s%s" "%s"' % (self.output_path, self.output_path, file_name, urljoin(response.url, link))
                retcode = subprocess.call(cmd, shell=True)
                yield item
            else:
                yield response.follow(link, callback=self.parse_iframe, meta={'item': item})

    def parse_iframe(self, response):
        link = response.css('.article-content iframe').xpath('@src').extract_first()
        item = response.meta['item']
        file_name = '%s.pdf' % (item['sitting'], )
        cmd = 'mkdir -p %s && wget -A pdf -nd -r --no-parent -O "%s%s" "%s"' % (self.output_path, self.output_path, file_name, link)
        retcode = subprocess.call(cmd, shell=True)
        yield item
Example #9
class Spider(scrapy.Spider):
    name = "bills"
    allowed_domains = [
        "tncc.gov.tw",
    ]
    start_urls = ["http://www.tncc.gov.tw"]
    download_delay = 0.5
    county_abbr = os.path.dirname(os.path.realpath(__file__)).split('/')[-1]
    election_year = common.election_year(county_abbr)
    ads = {'2010': 1, '2014': 2, '2018': 3}
    ad = ads[election_year]

    def parse(self, response):
        return response.follow(
            response.xpath(u'//a[re:test(., "^議案資訊$")]/@href').extract_first(),
            callback=self.parse_query)

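    # Submit the search form once per proposing department (motiondept) within the selected term.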
    def parse_query(self, response):
        for value in response.xpath(
                u'//select[@name="motiondept"]/option[not(@value="")]/@value'
        ).extract():
            payload = {
                'menu1':
                response.xpath(
                    u'//select[@name="menu1"]/option[re:test(., "第\s*%d\s*屆")]/@value'
                    % self.ad).extract_first(),
                'motiondept':
                value
            }
            yield scrapy.FormRequest.from_response(response,
                                                   formdata=payload,
                                                   callback=self.parse_list,
                                                   dont_filter=True,
                                                   dont_click=True,
                                                   headers=common.headers(
                                                       self.county_abbr))

    def parse_list(self, response):
        for link in response.xpath(
                '//table[@id="printa"]/descendant::tr[count(td)>1]/descendant::a/@href'
        ):
            yield response.follow(link, callback=self.parse_profile)
        next_page = response.xpath(
            u'//a[re:test(., "^下一頁$")]/@href').extract_first()
        if next_page:
            yield response.follow(next_page, callback=self.parse_list)

    def parse_profile(self, response):
        item = {}
        item['election_year'] = self.election_year
        item['id'] = re.search('=([^&]+)', response.url).group(1)
        for key, label in [('type', u'提案類別'), ('category', u'審查會別'),
                           ('abstract', u'主旨'), ('description', u'說明'),
                           ('methods', u'辦法'), ('execution', u'辦理情形')]:
            content = response.xpath(
                u'string((//*[re:test(., "^%s$")]/following-sibling::td)[1])' %
                label).extract_first()
            if content:
                item[key] = content.strip()
        item['proposed_by'] = re.split(
            u'[\s、,.]',
            re.sub(
                u'(副?議長|議員)', '',
                response.xpath(
                    u'(//*[re:test(., "^提案單位/人$")]/following-sibling::td)[1]/text()'
                ).extract_first()).strip())
        item['petitioned_by'] = re.split(
            u'[\s、,.]',
            re.sub(u'(副?議長|議員)', '', (response.xpath(
                u'(//*[re:test(., "^連署人$")]/following-sibling::td)[1]/text()').
                                      extract_first() or '')).strip())
        item['motions'] = []
        for date, motion in [
            (u'來文日期', u'來文字號'),
            (None, u'審查意見'),
            (u'決議日期', u'大會決議'),
            (u'發文日期', u'發文字號'),
        ]:
            date = response.xpath(
                u'(//*[re:test(., "%s")]/following-sibling::td)[1]/text()' %
                u'[\s ]*'.join(date)).extract_first() if date else None
            resolution = response.xpath(
                u'(//*[re:test(., "%s")]/following-sibling::td)[1]/text()' %
                u'[\s ]*'.join(motion)).extract_first()
            if resolution:
                item['motions'].append(
                    dict(
                        zip(['motion', 'resolution', 'date'],
                            [motion, resolution.strip(), date])))
        item['links'] = [{'url': response.url, 'note': 'original'}]
        for link in response.xpath(
                u'(//*[re:test(., "^議會附件")]/following-sibling::td)[1]/descendant::a/@href'
        ).extract():
            item['links'].append({
                'url': urljoin(response.url, link),
                'note': 'attach'
            })
        yield item
Example #10
class Spider(scrapy.Spider):
    name = "bills"
    allowed_domains = ["www.hcc.gov.tw"]
    start_urls = [
        "https://www.hcc.gov.tw/",
    ]
    download_delay = 1
    county_abbr = os.path.dirname(os.path.realpath(__file__)).split('/')[-1]
    election_year = common.election_year(county_abbr)
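    # Date window of each council term; rows outside the window are skipped or end the crawl.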
    election_year_map = {
        "2005": {
            "start": datetime(2006, 3, 1),
            "end": datetime(2010, 3, 1)
        },
        "2009": {
            "start": datetime(2010, 3, 1),
            "end": datetime(2014, 12, 25)
        },
        "2014": {
            "start": datetime(2014, 12, 25),
            "end": datetime(2018, 12, 25),
        },
        "2018": {
            "start": datetime(2018, 12, 25),
            "end": datetime(2022, 12, 25),
        },
    }
    term_range = election_year_map[election_year]

    def parse(self, response):
        for t in [u'議員提案', u'縣府提案']:
            yield response.follow(re.sub(
                '^\.\.', '',
                response.xpath(u'//a[@title="%s"]/@href' % t).extract_first()),
                                  callback=self.parse_page,
                                  meta={'type': t})

    def parse_page(self, response):
        bill_type = response.meta['type']
        for node in response.css('.list--table ul:not(:first-child)'):
            date = datetime.strptime(
                node.css('.date-list::text').extract_first(), '%Y-%m-%d')
            if date < self.term_range['start']:
                raise scrapy.exceptions.CloseSpider('out of date range')
            if date > self.term_range['end']:  # continue to next page
                break
            item = {}
            item['election_year'] = self.election_year
            item['type'] = bill_type
            item['category'] = node.css(
                u'[data-th*="類別:"]::text').extract_first()
            link = node.css('.more-list a::attr(href)').extract_first()
            item['id'] = link.split('=')[-1].zfill(6)
            if bill_type == u'縣府提案':
                item['id'] = 'gov-%s' % item['id']
            item['abstract'] = node.css(
                u'[data-th*="案由:"]::text').extract_first()
            item['proposed_by'] = (
                node.css(u'[data-th*="提案人:"]::text').extract_first()
                or '').split()
            item['petitioned_by'] = (
                node.css(u'[data-th*="聯署人:"]::text').extract_first()
                or '').split()
            yield response.follow(link,
                                  callback=self.parse_detail,
                                  meta={'item': item})
        if response.css('a.pager.pager-next[href]').extract():
            yield response.follow(response.css(
                'a.pager.pager-next[href]::attr(href)').extract_first(),
                                  callback=self.parse_page,
                                  meta={'type': bill_type})

    def parse_detail(self, response):
        item = response.meta['item']
        item['links'] = [{'url': response.url, 'note': 'original'}]
        for link in response.css(
                '.list--none.actions a::attr(href)').extract():
            item['links'].append({
                'url': urljoin(response.url, link),
                'note': 'attach'
            })
        return item
Example #11
class Spider(scrapy.Spider):
    name = "bills"
    allowed_domains = ["ilcc.gov.tw", ]
    start_urls = ["http://www.ilcc.gov.tw"]
    download_delay = 0.5
    county_abbr = os.path.dirname(os.path.realpath(__file__)).split('/')[-1]
    election_year = common.election_year(county_abbr)
    ads = {'1998': 14, '2002': 15, '2005': 16, '2009': 17, '2014': 18, '2018': 19}
    ad = ads[election_year]

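    # Frame-based site: walk leftFrame → 議案資料庫 → mainFrame before submitting the query form.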
    def parse(self, response):
        return response.follow(response.xpath(u'//frame[@name="leftFrame"]/@src').extract_first(), callback=self.parse_tab_frame)

    def parse_tab_frame(self, response):
        return response.follow(response.xpath(u'//a[@title="議案資料庫"]/@href').extract_first(), callback=self.parse_frame)

    def parse_frame(self, response):
        return response.follow(response.xpath(u'//frame[@name="mainFrame"]/@src').extract_first(), callback=self.parse_query)

    def parse_query(self, response):
        yield scrapy.FormRequest.from_response(response, callback=self.parse_list, dont_filter=True)

    def parse_list(self, response):
        for node in response.css('table#dg tr')[1:]:
            item = {}
            item['id'] = re.search(u'Fmotion_instanceOS=([^&]*)', node.xpath('td[1]/descendant::a/@href').extract_first()).group(1)
            yield response.follow(node.xpath('td[1]/descendant::a/@href').extract_first(), callback=self.parse_profile, meta={'item': item})
        next_page = response.xpath(u'//a[re:test(.,"下一頁")]/@href').extract_first()
        has_next_page = response.xpath(u'//select[@name="page"]/option[@selected]/following-sibling::option').extract()
        if next_page and has_next_page:
            payload = {'__EVENTTARGET': re.search("doPostBack\('([^']*)'", next_page).group(1)}
            yield scrapy.FormRequest.from_response(response, formdata=payload, callback=self.parse_list, dont_filter=True, dont_click=True, headers=common.headers(self.county_abbr))

    def parse_profile(self, response):
        item = response.meta['item']
        item_ad = response.css(u'#lbFmotion_expireb::text').extract_first()
        for election_year, ad in self.ads.items():
            if int(item_ad) == ad:
                item['election_year'] = election_year
                break
        if item.get('election_year') != self.election_year:
            return
        for key, label in [('bill_id', u'lbFmotion_No'), ('type', u'lbFmotion_Category'), ('category', u'lbFmotion_Class'), ('abstract', u'lbFmotion_From'), ('description', u'lbFmotion_Reason'), ('methods', u'lbFmotion_Way')]:
            content = response.css(u'#%s::text' % label).extract_first()
            if content:
                item[key] = content.strip()
        item['proposed_by'] = re.split(u'\s|、', re.sub(u'(副?議長|議員)', '', response.css(u'#lbFmotion_People::text').extract_first()).strip())
        item['petitioned_by'] = re.split(u'\s|、', re.sub(u'(副?議長|議員)', '', (response.css(u'#lbFmotion_AddTo::text').extract_first() or '')).strip())
        item['motions'] = []
        for motion, label in [(u'大會審議', 'lbFmotion_0'), (u'程序會審定', 'lbFmotion_v'), (u'大會決定', 'lbFmotion_1'), (u'分組審查', 'lbFmotion_g'), (u'大會決議', 'lbFmotion_2')]:
            date = response.css(u'#%sdate::text' % label).extract_first()
            resolution = response.css(u'#%sopinion::text' % label).extract_first()
            if date and resolution:
                item['motions'].append(dict(zip(['motion', 'resolution', 'date'], [motion, resolution.strip(), common.ROC2AD(date)])))
        item['links'] = [
            {
                'url': response.url,
                'note': 'original'
            }
        ]
        return item
Example #12
class Spider(scrapy.Spider):
    name = "bills"
    allowed_domains = [
        "kmcc.gov.tw",
    ]
    start_urls = ["http://www.kmcc.gov.tw"]
    download_delay = 3
    county_abbr = os.path.dirname(os.path.realpath(__file__)).split('/')[-1]
    election_year = common.election_year(county_abbr)
    ads = {'2009': 5, '2014': 6, '2018': 7}
    ad = ads[election_year]

    def parse(self, response):
        return response.follow(response.xpath(
            u'//a[re:test(., "^議事錄查詢$")]/@href').extract_first(),
                               callback=self.parse_query)

    def parse_query(self, response):
        for bill_type in response.xpath(
                u'//select[@name="Type"]/option[re:test(., "(提案|請願)")]/@value'
        ).extract():
            for council in response.xpath(
                    u'//select[@name="Council"]/option/@value').extract():
                payload = {'Type': bill_type, 'Council': council}
                yield scrapy.FormRequest.from_response(
                    response,
                    formdata=payload,
                    callback=self.parse_list,
                    dont_filter=True,
                    headers=common.headers(self.county_abbr))

    def parse_list(self, response):
        for link in response.css(
                '.GridItem a::attr(href),.GridAlternatingItem a::attr(href)'
        ).extract():
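            # Blocking sleep between detail requests, presumably to avoid the host's rate limiting.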
            time.sleep(60)
            yield response.follow(link, callback=self.parse_profile)
        next_page = response.css(
            u'.GridPager span ~ a::attr(href)').extract_first()
        if next_page:
            payload = {
                '__EVENTTARGET':
                re.search("doPostBack\('([^']*)'", next_page).group(1)
            }
            yield scrapy.FormRequest.from_response(response,
                                                   formdata=payload,
                                                   callback=self.parse_list,
                                                   dont_filter=True,
                                                   dont_click=True,
                                                   headers=common.headers(
                                                       self.county_abbr))

    def parse_profile(self, response):
        item = {}
        item['election_year'] = self.election_year
        item['id'] = re.search('=([^&]*)', response.url).group(1).zfill(6)
        for key, label in [('type', u'Type'), ('category', u'Kind'),
                           ('abstract', u'CasePoint'),
                           ('description', u'CaseExplain'),
                           ('methods', u'CaseMethod')]:
            content = response.css(u'#%s::text' % label).extract_first()
            if content:
                item[key] = content.strip()
        item['proposed_by'] = re.sub(
            u'(副?議長|議員)', '',
            response.css(u'#CaseUnit::text').extract_first()).strip().split(
                u'、')
        item['motions'] = []
        for motion, label in [(u'審查意見', 'CaseOpinion'),
                              (u'大會決議', 'Resolution')]:
            resolution = response.css(u'#%s::text' % label).extract_first()
            if resolution:
                item['motions'].append(
                    dict(
                        zip(['motion', 'resolution', 'date'],
                            [motion, resolution.strip(), None])))
        item['links'] = [{'url': response.url, 'note': 'original'}]
        return item
Example #13
class Spider(scrapy.Spider):
    name = "bills"
    allowed_domains = ["www.tcc.gov.tw", "tccmis.tcc.gov.tw"]
    start_urls = ["http://www.tcc.gov.tw"]
    download_delay = 0.5
    county_abbr = os.path.dirname(os.path.realpath(__file__)).split('/')[-1]
    election_year = common.election_year(county_abbr)
    ads = {
        '1969': 1,
        '1973': 2,
        '1977': 3,
        '1981': 4,
        '1985': 5,
        '1989': 6,
        '1994': 7,
        '1998': 8,
        '2002': 9,
        '2006': 10,
        '2010': 11,
        '2014': 12,
        '2018': 13
    }
    ad = ads[election_year]

    def parse(self, response):
        return response.follow(response.xpath(
            u'//a[re:test(., "^議事資訊系統$")]/@href').extract_first(),
                               callback=self.parse_frame)

    def parse_frame(self, response):
        return response.follow(
            response.xpath('//frame[@name="Search"]/@src').extract_first(),
            callback=self.parse_form)

    def parse_form(self, response):
        return scrapy.FormRequest.from_response(response,
                                                formname='OMForm',
                                                formdata={
                                                    'OmasDetr': str(self.ad),
                                                    'rdoDE': '0'
                                                },
                                                callback=self.parse_post)

    def parse_post(self, response):
        for node in response.xpath('//tr[@id="tr"]'):
            item = {}
            td = node.xpath('td/text()').extract()
            item['election_year'] = self.election_year
            item['id'] = td[1]
            item['bill_no'] = td[2]
            item['type'] = re.sub('\s', '', td[3])
            item['category'] = td[4]
            yield scrapy.Request(
                "http://tccmis.tcc.gov.tw/OM/OM_SearchDetail.asp?sys_no=%s" %
                item['id'],
                callback=self.parse_profile,
                meta={'item': item})

    def parse_profile(self, response):
        item = response.meta['item']
        nodes = response.xpath('//div[@id="detail"]/table/tr')
        motions, committee_motion, council_motion = [], {}, {}
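        # Walk the detail table row by row, dispatching on the label in each row's first cell.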
        for node in nodes:
            if node.xpath('td/text()')[0].re(u'目前處理程序'):
                item['last_action'] = node.xpath('td/text()').extract()[1]
            elif node.xpath('td/text()')[0].re(u'案由'):
                item['abstract'] = node.xpath('td/text()').extract()[1]
            elif node.xpath('td/text()')[0].re(u'提案人'):
                item['proposed_by'] = node.xpath(
                    'td/div/text()').extract()[0].strip().split(u'、')
            elif node.xpath('td/text()')[0].re(u'召集人/委員'):
                item['proposed_by'] = node.xpath(
                    'td/text()').extract()[1].strip().split(u'、')
            elif node.xpath('td/text()')[0].re(u'議決會次'):
                council_motion['motion'] = u'大會議決'
                council_motion['date'] = common.ROC2AD(
                    node.xpath('td/text()').extract()[1].split()[0])
                council_motion['sitting'] = ''.join(
                    node.xpath('td/text()').extract()[1].split()[1:])
            elif node.xpath('td/text()')[0].re(u'議決文'):
                council_motion['resolution'] = node.xpath(
                    'td/text()').extract()[1]
            elif node.xpath('td/text()')[0].re(u'案(\s| )+?號'):
                item['bill_no'] = node.xpath('td/text()').extract()[1].strip()
            elif node.xpath('td/text()')[0].re(u'來文文號'):
                td = node.xpath('td/text()').extract()[1].split()
                d = dict(
                    zip(['motion', 'resolution', 'date'],
                        [u'來文', None, common.ROC2AD(td[0])]))
                if len(td) > 1:
                    d['no'] = td[1]
                motions.append(d)
            elif node.xpath('td/text()')[0].re(u'收文日期'):
                motions.append(
                    dict(
                        zip(['motion', 'resolution', 'date'], [
                            u'收文', None,
                            common.ROC2AD(
                                node.xpath('td/text()').extract()[1])
                        ])))
            elif node.xpath('td/text()')[0].re(u'審查日期'):
                committee_motion['motion'] = u'委員會審查意見'
                committee_motion['date'] = common.ROC2AD(
                    node.xpath('td/text()').extract()[1])
            elif node.xpath('td/text()')[0].re(u'審查意見'):
                committee_motion['resolution'] = '\n'.join(
                    node.xpath('td/text()').extract()[1:])
            elif node.xpath('td/text()')[0].re(u'發文文號'):
                td = node.xpath('td/text()').extract()[1].split()
                d = dict(
                    zip(['motion', 'resolution', 'date'],
                        [u'發文', None, common.ROC2AD(td[0])]))
                if len(td) > 1:
                    d['no'] = td[1]
                motions.append(d)
            elif node.xpath('td/text()')[0].re(u'執行情形'):
                item['execution'] = node.xpath('td/text()').extract()[1]
            elif node.xpath('td/text()')[0].re(u'備[\s]*?註'):
                item['remark'] = '\n'.join(
                    node.xpath('td/text()').extract()[1:])
        for motion in [committee_motion, council_motion]:
            if motion:
                motions.append(motion)
        item['motions'] = sorted(motions,
                                 key=lambda x: x.get('date'),
                                 reverse=True)
        item['links'] = [{'url': response.url, 'note': 'original'}]
        return item
Example #14
class Spider(scrapy.Spider):
    name = "meeting"
    allowed_domains = ["www.ilcc.gov.tw"]
    start_urls = [
        "http://www.ilcc.gov.tw/Html/H_06/H_06.asp",
    ]
    download_delay = 0.5
    county_abbr = os.path.dirname(os.path.realpath(__file__)).split('/')[-1]
    election_year = common.election_year(county_abbr)
    output_path = common.meeting_minutes_output_path(county_abbr,
                                                     election_year)
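    # Restrict the meeting-record search to plenary sessions (大會).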
    payload = {
        'ddlcounciltype': u'大會',
    }

    def parse(self, response):
        return response.follow(
            response.xpath(u'//area[@alt="會議紀錄"]/@href').extract_first(),
            callback=self.parse_frame)

    def parse_frame(self, response):
        return response.follow(
            response.xpath('//frame[@id="FrMain"]/@src').extract_first(),
            callback=self.parse_meeting_info)

    def parse_meeting_info(self, response):
        return scrapy.FormRequest.from_response(response,
                                                formdata=self.payload,
                                                callback=self.parse_pages)

    def parse_pages(self, response):
        pages = response.xpath(
            '//select[@name="page"]/option/@value').extract()
        for page in pages:
            yield scrapy.FormRequest.from_response(response,
                                                   formdata={
                                                       'page': page,
                                                       'btSearch': None
                                                   },
                                                   callback=self.parse_post)

    def parse_post(self, response):
        trs = response.xpath('//table[@id="dg"]/descendant::tr[position()>1]')
        for tr in trs:
            item = {}
            item['election_year'] = self.election_year
            item['date'] = re.sub('\s', '',
                                  tr.xpath('string(td[1])').extract_first())
            item['sitting'] = re.sub(
                '\s', '', '%s%s' % (tr.xpath('string(td[2])').extract_first(),
                                    tr.xpath('string(td[3])').extract_first()))
            item['meeting'] = re.sub('\s', '',
                                     tr.xpath('string(td[5])').extract_first())
            yield response.follow(
                tr.xpath('td[4]/descendant::a/@href').extract_first(),
                callback=self.parse_profile,
                meta={'item': item})

    def parse_profile(self, response):
        item = response.meta['item']
        item['download_url'] = response.xpath(
            '//td/a[@target="_blank"]/@href').extract_first()
        if item['download_url']:
            ext = re.search(u'\.(\w+)$', item['download_url']).group(1)
            file_name = '%s_%s.%s' % (item['sitting'], item['meeting'], ext)
            cmd = 'mkdir -p %s && wget -c -O "%s%s" "%s"' % (
                self.output_path, self.output_path, file_name,
                item['download_url'])
            retcode = subprocess.call(cmd, shell=True)
        else:
            logging.error(response.url)
        return item
Example #15
class Spider(scrapy.Spider):
    name = "meeting"
    allowed_domains = ["www.ntp.gov.tw"]
    start_urls = [
        'https://www.ntp.gov.tw/content/information/information04.aspx'
    ]
    download_delay = 1
    county_abbr = os.path.dirname(os.path.realpath(__file__)).split('/')[-1]
    election_year = common.election_year(county_abbr)
    output_path = common.meeting_minutes_output_path(county_abbr,
                                                     election_year)

    def parse(self, response):
        for node in response.xpath(
                u'//a[contains(@title, "HTML檔")]/@href').extract():
            yield response.follow(node, callback=self.parse_sitting)

    def parse_sitting(self, response):
        for node in response.xpath(u'//td/descendant::a/@href').extract():
            yield response.follow(node, callback=self.parse_meeting)

    def parse_meeting(self, response):
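        # Binary files make response.xpath() raise NotSupported; those responses are skipped below.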
        try:
            sitting = response.xpath('//text()').re(u'(.+)日程表')[0]
            trs = [
                tr for tr in response.xpath('//table/descendant::tr')
                if tr.xpath('td[3]/text()').re('\d+')
            ]
            for tr in trs:
                item = {}
                item['election_year'] = self.election_year
                item['date'] = common.ROC2AD(
                    tr.xpath('td[1]/text()').extract_first())
                item['sitting'] = sitting
                item['meeting'] = tr.xpath('td[3]/text()').extract_first()
                item['download_url'] = tr.xpath(
                    'td[6]/descendant::a[1]/@href').extract_first()
                ext = item['download_url'].split('.')[-1]
                file_name = '%s_%s.%s' % (item['sitting'], item['meeting'],
                                          ext)
                if ext == 'pdf':
                    yield response.follow(item['download_url'],
                                          callback=self.download_pdf,
                                          meta={
                                              'item': item,
                                              'file_name': file_name
                                          })
                elif ext == 'htm':
                    yield response.follow(item['download_url'],
                                          callback=self.parse_html,
                                          meta={
                                              'item': item,
                                              'file_name': file_name
                                          })
        except scrapy.exceptions.NotSupported:
            pass

    def download_pdf(self, response):
        item = response.meta['item']
        item['download_url'] = response.url
        cmd = 'mkdir -p %s && wget --no-check-certificate -c -O "%s%s" "%s"' % (
            self.output_path, self.output_path, response.meta['file_name'],
            item['download_url'])
        retcode = subprocess.call(cmd, shell=True)
        return item

    def parse_html(self, response):
        item = response.meta['item']
        item['download_url'] = response.url
        text = '\n'.join(response.xpath('//pre/text()').extract())
        write_file(
            text, '%s%s_%s.txt' %
            (self.output_path, item['sitting'], item['meeting']))
        return item
Example #16
                        ) ___
                        order by role, count desc
                                        ) ____
                group by role
                order by sum desc
                ) _____
                ) row
            ))
        ))
        where uid = %s
	''', [uid, uid])


conn = db_settings.con()
c = conn.cursor()
election_year = common.election_year('')
if len(argv) > 1:
    target_county = ast.literal_eval(argv[1])['county']
else:
    target_county = '*'

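# Re-import each county's bills-<year>.json, skipping files from terms before the current election year.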
for f in sorted(glob.glob('../../data/%s/bills-*.json' % target_county)):
    if int(re.search('bills-(\d+).json', f).group(1)) < int(election_year):
        continue
    print(f)
    county_abbr = f.split('/')[-2]
    f_election_year = re.search('-(\d+)\.json', f).group(1)
    county = common.county_abbr2string(county_abbr)
    county_abbr3 = common.county2abbr3(county)
    dict_list = json.load(open(f))
    for bill in dict_list:
Example #17
class Spider(scrapy.Spider):
    name = "bills"
    allowed_domains = [
        "tpa.gov.tw",
    ]
    start_urls = ["http://ylcc.digital.tpa.gov.tw/"]
    download_delay = 0.5
    county_abbr = os.path.dirname(os.path.realpath(__file__)).split('/')[-1]
    election_year = common.election_year(county_abbr)
    ads = {
        '2009': u'雲林縣.*?第(十七|17)屆',
        '2014': u'雲林縣.*?第(十八|18)屆',
        '2018': u'雲林縣.*?第(十九|19)屆'
    }
    ad = ads[election_year]

    def parse(self, response):
        return response.follow('/index.php?act=GuestLogin',
                               callback=self.parse_login)

    def parse_login(self, response):
        return response.follow(
            response.xpath(u'//a[re:test(., "^提案$")]/@href').extract_first(),
            callback=self.parse_unordered,
            headers=common.headers(self.county_abbr))

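    # Persist a DESC sort preference server-side, then re-fetch the list so newest bills come first.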
    def parse_unordered(self, response):
        payload = {
            'act': 'search_set',
            'field': 'SET_OrderByMethod',
            'value': 'DESC'
        }
        yield scrapy.FormRequest(response.urljoin('application.php'),
                                 formdata=payload,
                                 callback=self.parse_reload,
                                 meta={'url': response.url},
                                 headers=common.headers(self.county_abbr))

    def parse_reload(self, response):
        return response.follow(response.meta['url'],
                               callback=self.parse_query,
                               dont_filter=True,
                               headers=common.headers(self.county_abbr))

    def parse_query(self, response):
        pages = re.sub(
            '\D', '',
            response.css('.result_select').xpath('string()').extract_first())
        for node in response.css('.result_content'):
            link_node = node.css('.acc_link a')
            if link_node.xpath('text()').re(self.ad):
                item = {}
                item['election_year'] = self.election_year
                link = link_node.xpath('@href').extract_first()
                item['id'] = node.css('.acc_type::text').extract_first().split(
                    '@')[0].strip()
                level = node.xpath(
                    u'string((descendant::span[re:test(., "類別階層")]/following-sibling::span)[1])'
                ).extract_first()
                item['type'], item['category'] = re.search(
                    u'/([^/]+)/?(.*)$', level).groups()
                item['abstract'] = re.sub(
                    '\s', '',
                    node.css('.result_text::text').extract_first())
                yield response.follow(link,
                                      callback=self.parse_profile,
                                      meta={
                                          'item': item,
                                          'handle_httpstatus_list': [302],
                                          'dont_redirect': True
                                      },
                                      headers=common.headers(self.county_abbr))
            else:
                raise scrapy.exceptions.CloseSpider('out of date range')
            time.sleep(.5)
        next_page = response.css(
            '.page_botton.pb_pagedw::attr(href)').extract_first()
        if next_page:
            yield response.follow(next_page, callback=self.parse_query)

    def parse_profile(self, response):
        try:
            payload = {
                'act': 'act_initial',
                'target': re.search('=([^&]*)',
                                    response.headers['Location']).group(1),
                'refer': 'serial'
            }
        except (KeyError, AttributeError):
            self.logger.error('headers: %s', response.headers)
            self.logger.error('body: %s', response.body)
            self.logger.error('status: %s, url: %s', response.status, response.url)
            self.logger.error('profile: %s', response.urljoin(
                response.headers.get('Location', '')))
            raise scrapy.exceptions.CloseSpider('no redirect location')
        yield scrapy.FormRequest(response.urljoin(
            response.headers['Location']),
                                 formdata=payload,
                                 callback=self.parse_post,
                                 meta={'item': response.meta['item']})

    def parse_post(self, response):
        item = response.meta['item']
        try:
            jr = json.loads(response.body_as_unicode())['data']['meta'][0]
        except (ValueError, KeyError, IndexError):
            self.logger.error('no json response: %s', response.url)
            raise scrapy.exceptions.CloseSpider('no json response')
        item['proposed_by'] = re.sub(
            u'(副?議長|議員)', '',
            jr.get('Member') or jr.get('Organ') or jr.get('OrganPetiti')
            or jr.get('Chairman') or jr.get('Council')).strip().split(u',')
        if not item['proposed_by'][0]:
            self.logger.error(jr)
            raise scrapy.exceptions.CloseSpider('empty proposed_by')
        item['petitioned_by'] = re.sub(
            u'(副?議長|議員)', '', (jr.get('MemberRelated') or jr.get('OrganPetiti')
                               or '')).strip().split(u',')
        item['links'] = [{'url': response.url, 'note': 'original'}]
        return item
Example #18
class Spider(scrapy.Spider):
    name = "councilors"
    allowed_domains = ["www.kmc.gov.tw"]
    start_urls = [
        "http://www.kmc.gov.tw/",
    ]
    download_delay = 0.5
    county_abbr = os.path.dirname(os.path.realpath(__file__)).split('/')[-1]
    election_year = common.election_year(county_abbr)
    county = common.county_abbr2string(county_abbr)

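    # Load the constituency map and MOI 2018 candidate data (used to look up gender by name).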
    def __init__(self):
        with open(os.path.join(os.path.dirname(__file__), 'constituency.json'),
                  'r') as infile:
            self.constituency = json.loads(infile.read())
        with open(
                os.path.join(os.path.dirname(__file__),
                             '../../data/cand-moi-county-control-2018.json'),
                'r') as infile:
            self.ref = {
                re.sub(u'[\s ]', '', person['idname']): person
                for person in json.loads(infile.read())
                if person['cityname'] == u'基隆市'
            }

    def parse(self, response):
        return response.follow(
            response.xpath(u'//a[re:test(., "^議員資訊$")]/@href').extract_first(),
            callback=self.parse_list)

    def parse_list(self, response):
        for link in response.css('#speaker a::attr(href)'):
            yield response.follow(link, callback=self.parse_profile)

    def parse_profile(self, response):
        item = {}
        item['election_year'] = self.election_year
        item['county'] = self.county
        item['in_office'] = True
        item['term_start'] = '%s-12-25' % item['election_year']
        item['term_end'] = {'date': '2018-12-24'}
        self.logger.debug(response.xpath(
            u'//p/span[re:test(., "\s+副?議(員|長)")]/text()').extract_first())
        item['name'], item['title'] = response.xpath(
            u'//p/span[re:test(., "\s+副?議(員|長)")]/text()').extract_first(
            ).split()
        item['gender'] = self.ref[item['name']]['sex']
        item['constituency'] = response.xpath('//td/text()').re(
            u'選區:\s*(.+)')[0].strip()
        item['district'] = self.constituency[item['constituency']]
        item['image'] = urljoin(
            response.url,
            response.xpath(u'//p/img/@src').extract_first())
        item['links'] = [{'url': response.url, 'note': u'議會個人官網'}]
        item['party'] = response.xpath('//td/text()').re(
            u'政黨:\s*(.+)')[0].strip()
        item['birth'] = common.ROC2AD(
            response.xpath('//td/text()').re(u'出生日期:\s*(.+)')[0])
        website = response.xpath('//td/text()').re(u'網站連結:\s*(.+)')
        if website:
            item['links'].append({'url': website[0].strip(), 'note': u'個人網站'})
        item['contact_details'] = []
        contact_mappings = {
            u'連絡電話': 'voice',
            u'傳真號碼': 'fax',
            u'服務處': 'address',
            u'電子郵件': 'email'
        }
        for label, name in contact_mappings.items():
            values = [
                x.strip() for x in
                response.xpath(u'//td[re:test(., "%s:")]/text()' %
                               '\s*'.join(label)).re(u'%s:\s*(.+)\s*' % label)
                if x.strip()
            ]
            for value in values:
                item['contact_details'].append({
                    'label': label,
                    'type': name,
                    'value': value
                })
        item['experience'] = [
            x.strip()
            for x in response.xpath(u'//img[contains(@src, "speaker0")]')
            [1].xpath('ancestor::tr/following-sibling::tr[1]//tr/td[1]/text()'
                      ).extract() if x.strip()
        ]
        item['platform'] = [
            x.strip()
            for x in response.xpath(u'//img[contains(@src, "speaker0")]')
            [2].xpath('ancestor::tr/following-sibling::tr[1]//tr/td[1]/text()'
                      ).extract() if x.strip()
        ]
        yield item
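
The contact-details loop above builds its label regex with '\s*'.join(label). A short sketch of why, using hypothetical cell contents: the join interleaves optional whitespace between the label's characters, so a cell rendered as 連 絡 電 話 still matches:

# -*- coding: utf-8 -*-
import re

label = u'連絡電話'
# u'\\s*'.join(label) yields u'連\s*絡\s*電\s*話'
pattern = u'\\s*'.join(label) + u':\\s*(.+)'
for cell in [u'連絡電話:02-1234-5678', u'連 絡 電 話: 02-8765-4321']:
    match = re.search(pattern, cell)
    if match:
        print(match.group(1).strip())
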
Exemplo n.º 19
class Spider(scrapy.Spider):
    name = "bills"
    handle_httpstatus_list = [302]
    allowed_domains = ["kcc.gov.tw"]
    start_urls = ["http://www.kcc.gov.tw",]
    download_delay = 0.5
    county_abbr = os.path.dirname(os.path.realpath(__file__)).split('/')[-1]
    election_year = common.election_year(county_abbr)
    ads = {'2010': u'一', '2014': u'二', '2018': u'三'}
    ad = ads[election_year]

    def parse(self, response):
        return response.follow(response.xpath(u'//a[re:test(., "^大會提案$")]/@href').extract_first(), callback=self.parse_query)

    def parse_query(self, response):
        payload = {
            'ctl00$ContentPlaceHolder1$uscPeriodSessionMeeting$ddlSession': response.xpath(u'//select[@name="ctl00$ContentPlaceHolder1$uscPeriodSessionMeeting$ddlSession"]/option[re:test(., "%s屆")]/@value' % self.ad).extract_first(),
            'ctl00$ContentPlaceHolder1$uscPeriodSessionMeeting$ddlMeeting': '',
            '__EVENTTARGET': re.search("__doPostBack\('([^']*)", response.css('#ContentPlaceHolder1_LinkButton1::attr(href)').extract_first()).group(1)
        }
        yield scrapy.FormRequest.from_response(response, formdata=payload, callback=self.parse_type, dont_filter=True, dont_click=True, headers=common.headers(self.county_abbr))

    def parse_type(self, response):
        tabs = response.xpath('//div[@id="tabs"]/ul/li/a')
        for i, tab in enumerate(tabs, 1):
            type, count = tab.xpath('text()').extract()
            count = re.sub('\D', '', count)
            if count:
                payload = {"ctl00$ContentPlaceHolder1$DataPager%d$ctl02$txtPageSize" % i: count}
                if i != 1:
                    payload["ctl00$ContentPlaceHolder1$btnGo%d" % i] = " Go "
                else:
                    payload["ctl00$ContentPlaceHolder1$btnGo"] = " Go "
                yield scrapy.FormRequest.from_response(response, formdata=payload, callback=self.parse_tab, dont_filter=True, meta={'type': tab.xpath('text()').extract_first().strip(), 'tab_id': 'tabs-%d' % i})

    def parse_tab(self, response):
        trs = response.xpath('//div[@id="%s"]/div/table/tr[count(td)>1]' % response.meta['tab_id'])
        for tr in trs:
            item = {}
            item['election_year'] = self.election_year
            item['type'] = response.meta['type']
            item['last_action'] = tr.xpath('td[6]/text()').extract_first()
            link = tr.xpath('td[@onclick]/@onclick').re(u"\.href='([^']+)'")[0]
            yield response.follow(link, callback=self.parse_profile, meta={'dont_redirect': True, 'item': item})

    def parse_profile(self, response):
        item = response.meta['item']
        item['id'] = '-'.join(re.findall(u'=([^&]*)', response.url))
        for key, label in [('category', u'類別'), ('abstract', u'案由'), ('description', u'說明'), ('methods', u'辦法'), ('remark', u'備註'), ]:
            content = response.xpath(u'string((//td[re:test(., "%s")]/following-sibling::td)[1])' % label).extract_first()
            if content:
                item[key] = content.strip()
        item['proposed_by'] = re.split(u'\s|、', re.sub(u'(副?議長|議員)', '', u'、'.join([x.strip() for x in response.xpath(u'(//td[re:test(., "提案(人|單位)")]/following-sibling::td)[1]/text()').extract()])))
        item['petitioned_by'] = re.split(u'\s|、', re.sub(u'(副?議長|議員)', '', u'、'.join([x.strip() for x in (response.xpath(u'(//td[re:test(., "連署人")]/following-sibling::td)[1]/text()').extract() or [])])))
        item['motions'] = []
        for motion in [u'一讀', u'委員會審查意見', u'二讀決議', u'三讀決議', ]:
            date = common.ROC2AD(''.join(response.xpath(u'(//td[re:test(., "%s")]/following-sibling::td)[1]/span/text()' % motion).extract()))
            resolution = ''.join([x.strip() for x in response.xpath(u'(//td[re:test(., "%s")]/following-sibling::td)[1]/text()' % motion).extract()])
            if date or resolution:
                item['motions'].append(dict(zip(['motion', 'resolution', 'date'], [motion, resolution, date])))
        item['links'] = [
            {
                'url': response.url,
                'note': 'original'
            }
        ]
        return item
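
parse_query above recovers the ASP.NET postback target from a __doPostBack href and replays it as __EVENTTARGET; FormRequest.from_response carries __VIEWSTATE and the rest of the form state automatically. A standalone sketch with an illustrative href (not captured from the live site):

import re

href = "javascript:__doPostBack('ctl00$ContentPlaceHolder1$LinkButton1','')"
# The first __doPostBack argument is the control id the server expects
# back in the __EVENTTARGET form field.
target = re.search(r"__doPostBack\('([^']*)", href).group(1)
payload = {'__EVENTTARGET': target}
print(payload)  # {'__EVENTTARGET': 'ctl00$ContentPlaceHolder1$LinkButton1'}
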
Exemplo n.º 20
class Spider(scrapy.Spider):
    name = "bills"
    allowed_domains = ["cyscc.gov.tw", ]
    start_urls = ["http://www.cyscc.gov.tw"]
    download_delay = 0.5
    county_abbr = os.path.dirname(os.path.realpath(__file__)).split('/')[-1]
    election_year = common.election_year(county_abbr)
    ads = {'2009': 17, '2014': 18, '2018': 19}
    ad = ads[election_year]

    def parse(self, response):
        return response.follow(response.xpath(u'//a[re:test(., "^議會資料庫$")]/@href').extract_first(), callback=self.parse_tab)

    def parse_tab(self, response):
        return response.follow(response.xpath(u'//a[re:test(., "^議會資料庫查詢系統$")]/@href').extract_first(), callback=self.parse_query)

    def parse_query(self, response):
        for value in response.xpath(u'//input[@name="ctl00$ContentPlaceHolder1$rbtnMKind"]/@value').extract():
            payload = {'ctl00$ContentPlaceHolder1$rbtnMKind': value}
            yield scrapy.FormRequest.from_response(response, formdata=payload, callback=self.parse_list, dont_filter=True, dont_click=True, headers=common.headers(self.county_abbr))

    def parse_list(self, response):
        pages = response.css('#ctl00_ContentPlaceHolder1_gvIndex_ctl13_lblPageCount::text').extract_first()
        self.logger.debug('page count: %s', pages)
        node_ad = self.ad  # fallback so the next-page check below cannot hit an unbound name
        for node in response.css('.main3_3_04,.main3_3_05'):
            node_ad = int(node.xpath('td[2]/text()').re(u'(\d+)\s*屆')[0])
            if node_ad < self.ad:
                break
            if node_ad > self.ad:
                continue
            yield response.follow(node.xpath('td[6]/span/a/@href').extract_first(), callback=self.parse_profile)
        next_page = response.xpath(u'//a[re:test(.,"下一頁")]/@href').extract_first()
        if next_page and node_ad >= self.ad:
            payload = {'__EVENTTARGET': re.search("doPostBack\('([^']*)'", next_page).group(1)}
            yield scrapy.FormRequest.from_response(response, formdata=payload, callback=self.parse_list, dont_filter=True, dont_click=True, headers=common.headers(self.county_abbr))

    def parse_profile(self, response):
        item = {}
        item['election_year'] = self.election_year
        item['id'] = re.search('=([^&]*)', response.url).group(1).zfill(6)
        for key, label in [('type', u'提案類別'), ('category', u'類別'), ('abstract', u'案由'), ('description', u'說明'), ('methods', u'辦法')]:
            content = response.xpath(u'(//td[re:test(., "^%s$")]/following-sibling::td)[1]/span/text()' % label).extract_first()
            if content:
                item[key] = content.strip()
        item['proposed_by'] = re.sub(u'(副?議長|議員)', '', response.xpath(u'(//td[re:test(., "^提\s*案\s*人$")]/following-sibling::td)[1]/span/text()').extract_first()).strip().split(u'、')
        item['petitioned_by'] = re.sub(u'(副?議長|議員)', '', (response.xpath(u'(//td[re:test(., "^連\s*署\s*人$")]/following-sibling::td)[1]/span/text()').extract_first() or '')).strip().split(u'、')
        item['motions'] = []
        for motion in [u'審查意見', u'大會決議']:
            resolution = response.xpath(u'(//td[re:test(., "^%s$")]/following-sibling::td)[1]/span/text()' % motion).extract_first()
            if resolution:
                item['motions'].append(dict(zip(['motion', 'resolution', 'date'], [motion, resolution.strip(), None])))
        item['links'] = [
            {
                'url': response.url,
                'note': 'original'
            }
        ]
        for link in response.css('#ctl00_ContentPlaceHolder1_fvDetail_dlRelFile a::attr(href)').extract():
            item['links'].append(
                {
                    'url': urljoin(response.url, link),
                    'note': 'attach'
                }
            )
        return item
Exemplo n.º 21
class Spider(scrapy.Spider):
    name = "bills"
    allowed_domains = [
        "cycc.gov.tw",
    ]
    start_urls = ["http://www.cycc.gov.tw/index2.asp"]
    download_delay = 0.5
    county_abbr = os.path.dirname(os.path.realpath(__file__)).split('/')[-1]
    election_year = common.election_year(county_abbr)
    ads = {'2009': 17, '2014': 18, '2018': 19}
    ad = ads[election_year]

    def parse(self, response):
        return response.follow(
            response.xpath(u'//a[img[@alt="議案查詢"]]/@href').extract_first(),
            callback=self.parse_tab)

    def parse_tab(self, response):
        return response.follow(response.xpath(
            u'//a[re:test(., "^議決案檢索$")]/@href').extract_first(),
                               callback=self.parse_query)

    def parse_query(self, response):
        for value in response.xpath(
                u'//select[@name="sid"]/option/@value').extract():
            payload = {'sid': value}
            yield scrapy.FormRequest.from_response(response,
                                                   formdata=payload,
                                                   callback=self.parse_list,
                                                   dont_filter=True,
                                                   dont_click=True,
                                                   headers=common.headers(
                                                       self.county_abbr))

    def parse_list(self, response):
        for i, node in enumerate(
                response.xpath('//table[@bgcolor][not(caption)]')):
            item = {}
            item['election_year'] = self.election_year
            item['id'] = '%s-%02d' % ('-'.join(
                re.sub('\D', ' ', response.url).split()), i)
            for key, label in [('category', u'類[\s ]*別'),
                               ('abstract', u'案[\s ]*由'),
                               ('description', u'理[\s ]*由'),
                               ('methods', u'辦[\s ]*法')]:
                content = response.xpath(
                    u'(//*[re:test(., "%s")]/following-sibling::td)[1]/span/text()'
                    % label).extract_first()
                if content:
                    item[key] = content.strip()
            item['proposed_by'] = re.sub(
                u'(副?議長|議員)', '',
                response.xpath(
                    u'(//*[re:test(., "提[\s ]*案[\s ]*人")]/following-sibling::td)[1]/span/text()'
                ).extract_first()).strip().split(u'、')
            item['petitioned_by'] = re.sub(u'(副?議長|議員)', '', (response.xpath(
                u'(//*[re:test(., "連[\s ]*署[\s ]*人")]/following-sibling::td)[1]/span/text()'
            ).extract_first() or '')).strip().split(u'、')
            item['motions'] = []
            for motion in [u'審查意見', u'決議']:
                resolution = response.xpath(
                    u'(//*[re:test(., "%s")]/following-sibling::td)[1]/span/text()'
                    % u'[\s ]*'.join(motion)).extract_first()
                if resolution:
                    item['motions'].append(
                        dict(
                            zip(['motion', 'resolution', 'date'],
                                [motion, resolution.strip(), None])))
            item['links'] = [{'url': response.url, 'note': 'original'}]
            yield item
        next_page = response.xpath(
            u'//a[img[@alt="下一頁"]]/@href').extract_first()
        if next_page:
            yield response.follow(next_page, callback=self.parse_list)
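
The id field in parse_list above is assembled from the digit runs embedded in the result URL plus the row index. A sketch with a hypothetical query URL:

import re

url = 'http://www.cycc.gov.tw/result.asp?sid=17&page=3'  # illustrative only
# Turn every non-digit into a space and split: the digit runs ('17', '3')
# remain, and the row index is appended as a two-digit suffix.
digit_runs = re.sub(r'\D', ' ', url).split()
print('%s-%02d' % ('-'.join(digit_runs), 0))  # -> '17-3-00'
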
Exemplo n.º 22
class Spider(scrapy.Spider):
    name = "meeting"
    allowed_domains = ["obas_front.tcc.gov.tw"]
    start_urls = [
        "http://obas_front.tcc.gov.tw:8080/Agenda/EFileSearch.aspx?FileGrpKind=2&h=600",
    ]
    download_delay = 0.5
    county_abbr = os.path.dirname(os.path.realpath(__file__)).split('/')[-1]
    election_year = common.election_year(county_abbr)
    output_path = common.meeting_minutes_output_path(county_abbr,
                                                     election_year)
    payload = {
        'btnCongress': u'大會',
        'txtPageSize': u'300',
    }

    def parse(self, response):
        return scrapy.FormRequest.from_response(response,
                                                formdata=self.payload,
                                                callback=self.parse_post)

    def parse_post(self, response):
        links = response.xpath(
            '//table/tr/td/a[contains(@href, "EFileDetail.aspx")]/@href'
        ).extract()
        for link in links:
            yield response.follow(link, callback=self.parse_profile)

    def parse_profile(self, response):
        item = {}
        item['election_year'] = self.election_year
        nodes = response.xpath('//table/tbody/tr')
        ref = {
            u'屆別': {
                'key': 'sitting',
                'path': 'td/span/text()'
            },
            u'類別': {
                'key': 'category',
                'path': 'td/span/text()'
            },
            u'日期': {
                'key': 'date',
                'path': 'td/span/text()'
            },
            u'資料名稱': {
                'key': 'meeting',
                'path': 'td/span/text()'
            },
            u'檔案': {
                'key': 'download_url',
                'path': 'td/a/@href',
                'extra': 'http://obas_front.tcc.gov.tw:8080/Agenda/'
            },
        }
        for node in nodes:
            value = ref.get(node.xpath('th/text()').extract_first().strip())
            if value:
                item[value['key']] = '%s%s' % (value.get(
                    'extra', ''), node.xpath(value['path']).extract_first())
        item['date'] = common.ROC2AD(item['date'])
        ext = re.search(u'FileName=[\w\d]+\.(\w+)&',
                        item['download_url']).group(1)
        file_name = '%s_%s.%s' % (item['sitting'], item['meeting'], ext)
        cmd = 'mkdir -p %s && wget -c -O %s%s "%s"' % (
            self.output_path, self.output_path, file_name,
            item['download_url'])
        retcode = subprocess.call(cmd, shell=True)
        return item
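
The download step above interpolates the URL into a shell string, which breaks if the URL ever contains quotes or shell metacharacters. A hedged alternative sketch (the download helper name is mine) that hands wget an argument list instead:

import os
import subprocess

def download(output_path, file_name, url):
    # Equivalent of the mkdir-and-wget line above, but the URL is passed
    # as a literal argument rather than interpreted by a shell.
    if not os.path.isdir(output_path):
        os.makedirs(output_path)
    return subprocess.call(
        ['wget', '-c', '-O', os.path.join(output_path, file_name), url])
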
Exemplo n.º 23
    2: '1973',
    3: '1977',
    4: '1981',
    5: '1985',
    6: '1989',
    7: '1994',
    8: '1998',
    9: '2002',
    10: '2006',
    11: '2010',
    12: '2014',
    13: '2018'
}
county_abbr = os.path.dirname(os.path.realpath(__file__)).split('/')[-1]
county = common.county_abbr2string(county_abbr)
election_year = common.election_year(county)
county_abbr3 = common.county2abbr3(county)
total_text = codecs.open(
    u"../../../data/tcc/meeting_minutes-%s.txt" % election_year, "r",
    "utf-8").read()

Session_Token = re.compile(
    u'''
    \s*
    (?P<name>
        %s議會
        第\s*(?P<ad>[\d]+)\s*屆
        第\s*(?P<session>[\d]+)\s*次(?P<type>(定期|臨時))大會
        (預備會議暨)?
        第\s*(?P<times>[\d]+)\s*次
        會議
    )
    # pattern truncated in the source listing; the name group and the
    # re.compile call are closed here so the snippet parses (re.X and the
    # "% county" substitution are assumptions based on the %s placeholder)
    ''' % county, re.X)
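
Assuming the pattern above is compiled in verbose mode (re.X), consuming a match comes down to groupdict(). A trimmed stand-in, not the full production pattern:

# -*- coding: utf-8 -*-
import re

token = re.compile(u'''
    第\s*(?P<ad>\d+)\s*屆
    第\s*(?P<session>\d+)\s*次(?P<type>定期|臨時)大會
''', re.X)

match = token.search(u'第 19 屆第 2 次定期大會會議')
print(match.groupdict())  # {'ad': u'19', 'session': u'2', 'type': u'定期'}
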
Exemplo n.º 24
class Spider(scrapy.Spider):
    name = "bills"
    allowed_domains = [
        "phcouncil.gov.tw",
    ]
    start_urls = ["http://www.phcouncil.gov.tw/"]
    download_delay = 1
    county_abbr = os.path.dirname(os.path.realpath(__file__)).split('/')[-1]
    election_year = common.election_year(county_abbr)
    ads = {'2009': 17, '2014': 18, '2018': 19}
    ad = ads[election_year]

    def parse(self, response):
        for node in response.xpath(
                u'((//a[re:test(., "^議會相關法案$")]/following-sibling::ul)[1]/descendant::*[re:test(., "第\s*%s\s*屆")]/following-sibling::ul)[1]/descendant::a'
                % self.ad):
            yield response.follow(
                node.xpath('@href').extract_first(),
                callback=self.parse_query,
                meta={'type': node.xpath('text()').extract_first()})

    def parse_query(self, response):
        for link in response.css(u'#Main_Table').xpath(
                u'descendant::a[re:test(., "詳細內容")]/@href').extract():
            yield response.follow(link,
                                  callback=self.parse_list,
                                  meta={'type': response.meta['type']})
        next_page = response.xpath(
            u'//a[img[@alt="下一頁"]]/@href').extract_first()
        if next_page:
            yield response.follow(next_page,
                                  callback=self.parse_query,
                                  meta={'type': response.meta['type']})

    def parse_list(self, response):
        for link in response.css(u'#Main_Table').xpath(
                u'descendant::a[re:test(., "詳細內容")]/@href').extract():
            yield response.follow(link,
                                  callback=self.parse_profile,
                                  meta={'type': response.meta['type']})
        next_page = response.xpath(
            u'//a[img[@alt="下一頁"]]/@href').extract_first()
        if next_page:
            yield response.follow(next_page,
                                  callback=self.parse_list,
                                  meta={'type': response.meta['type']})

    def parse_profile(self, response):
        item = {}
        item['election_year'] = self.election_year
        item['type'] = response.meta['type']
        item['id'] = '%s-%s' % (self.election_year,
                                re.search(u'id=([^&]*)',
                                          response.url).group(1))
        for key, label in [('category', u'類[\s ]*別'),
                           ('abstract', u'案[\s ]*由'),
                           ('description', u'說[\s ]*明'),
                           ('methods', u'辦[\s ]*法'),
                           ('execution', u'決[\s ]*議'),
                           ('execution', u'議[\s ]*決')]:
            content = response.xpath(
                u'string((//*[re:test(., "%s")]/following-sibling::td)[1])' %
                label).extract_first()
            if content:
                item[key] = content.strip()
        if item['type'] == u'縣府提案':
            item['proposed_by'] = u'縣府'
        else:
            item['proposed_by'] = re.split(
                u'[,、 ]',
                re.sub(
                    u'(副?議長|議員)', '',
                    response.xpath(
                        u'//*[re:test(., "(提[\s ]*案|動[\s ]*議|請[\s ]*願)[\s ]*人")]'
                    )[-1].xpath('following-sibling::td[1]/text()').
                    extract_first()).strip())
        petition_labels = response.xpath(
            u'//td[re:test(., "(連[\s ]*署|附[\s ]*議)[\s ]*人")]')
        if petition_labels:
            item['petitioned_by'] = re.split(
                u'[,、 ]',
                re.sub(
                    u'(副?議長|議員)', '',
                    petition_labels[-1].xpath(
                        'following-sibling::td[1]/text()').extract_first()
                ).strip())
        else:
            item['petitioned_by'] = ['']
        item['links'] = [{'url': response.url, 'note': 'original'}]
        yield item