# -*- coding: utf-8 -*-
# Python 2 scraping helpers for National Assembly bill pages. Assumed to be
# provided by the surrounding module: math, re, sys, lxml.etree, lxml.html,
# utils, the constants DIR, BASEURL, X, PAGE_SIZE, and the helpers convert,
# parse_columns, list_to_file, extract_row_contents.


def find_pledges(filename, x_pledge, x_content):
    page = utils.read_webpage(filename)
    pledges = [list(row.itertext())[0].strip().replace('"', '\'')
               for row in page.xpath(x_pledge)]
    contents = ['\n'.join(row.itertext()).strip().replace('"', '\'')
                for row in page.xpath(x_content)]
    return pledges, contents
def fetch_new_bill_ids(assembly_id): directory = DIR['meta'] meta_data = '%s/%d.csv' % (directory, assembly_id) lines = list(open(meta_data, 'r'))[1:] lines = [line.decode('utf-8') for line in lines] existing_ids = set(line.split(',', 1)[0].strip('"') for line in lines) last_proposed_date = max( line.split('","', 6)[5].strip('"') for line in lines) baseurl = BASEURL['list'] url = '%(baseurl)sPROPOSE_FROM=%(last_proposed_date)s&PAGE_SIZE=100' % locals( ) directory = '%s/%s' % (DIR['list'], assembly_id) fn = '%s/tmp.html' % directory utils.get_webpage(url, fn) p = utils.read_webpage(fn) rows = utils.get_elems(p, X['table']) new_bill_ids = [] with open(meta_data, 'a') as f: for r in reversed(rows): columns = r.xpath(X['columns']) if len(columns) == 8: p = parse_columns(columns) if p[0] not in existing_ids: list_to_file(p, f) new_bill_ids.append(p[0]) return new_bill_ids
def fetch_new_bill_ids(assembly_id):
    directory = DIR['meta']
    meta_data = '%s/%d.csv' % (directory, assembly_id)
    lines = list(open(meta_data, 'r'))[1:]
    lines = [line.decode('utf-8') for line in lines]
    existing_ids = set(line.split(',', 1)[0].strip('"') for line in lines)
    last_proposed_date = max(
        line.split('","', 6)[5].strip('"') for line in lines)

    baseurl = BASEURL['list']
    page_size = PAGE_SIZE
    url = '%(baseurl)sPROPOSE_FROM=%(last_proposed_date)s&PAGE_SIZE=%(page_size)d'\
        % locals()
    directory = '%s/%s' % (DIR['list'], assembly_id)
    fn = '%s/tmp.html' % directory
    utils.get_webpage(url, fn)

    p = utils.read_webpage(fn)
    rows = utils.get_elems(p, X['table'])

    new_bill_ids = []
    with open(meta_data, 'a') as f:
        for r in reversed(rows):
            columns = r.xpath(X['columns'])
            if len(columns) == 8:
                p = parse_columns(columns)
                if p[0] not in existing_ids:
                    list_to_file(p, f)
                    new_bill_ids.append(p[0])
    return new_bill_ids
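# Usage sketch (an illustration, not part of the original source): drive the
# incremental crawl above for the 19th assembly, assuming DIR, BASEURL, X,
# PAGE_SIZE and the utils helpers are configured as elsewhere in this module.
#
#   for bill_id in fetch_new_bill_ids(19):
#       print 'new bill: %s' % bill_id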
def find_people(filename, x_table, x_links):
    page = utils.read_webpage(filename)
    table = page.xpath(x_table)
    names = [list(row.itertext())[9].strip() for row in table]
    parties = [list(row.itertext())[5].strip() for row in table]
    birthdays = [list(row.itertext())[19].strip().replace('/', '')
                 for row in table]
    ids = [re.search(r'[0-9]+', s).group(0) for s in page.xpath(x_links)]
    #for i, p in enumerate(list(table[1].itertext())): print i, p
    return names, parties, ids, birthdays
def extract_summaries(assembly_id, bill_id):
    # TODO: split out "reasons for proposal" & "main points" (제안이유 & 주요내용)
    try:
        fn = '%s/%s/%s.html' % (DIR['summaries'], assembly_id, bill_id)
        page = utils.read_webpage(fn)
        summaries = [e.replace('?', '/').strip()
                     for e in utils.get_elems(page, X['summary'])]
        return summaries
    except IOError:
        return None
def extract_proposers(assembly_id, bill_id):
    # TODO: check for cases where lawmaker names also appear in the list of
    # seconders (찬성의원)
    fn = '%s/%s/%s.html' % (DIR['proposers'], assembly_id, bill_id)
    page = utils.read_webpage(fn)
    elems = utils.get_elems(page, X['proposers'])
    if assembly_id < 19:
        return elems
    else:
        # From the 19th assembly on, each entry carries the Korean name,
        # party and hanja name, split on '(', '/' and ')'.
        key = ['name_kr', 'party', 'name_cn']
        values = [filter(None, re.split(r'[\(/\)]', e)) for e in elems]
        return [{k: v for k, v in zip(key, value)} for value in values]
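# Illustrative return shape for assembly_id >= 19 (hypothetical values),
# assuming each scraped entry reads like u'홍길동(무소속/洪吉童)':
#
#   [{'name_kr': u'홍길동', 'party': u'무소속', 'name_cn': u'洪吉童'}, ...]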
def get_npages(assembly_id):
    url, directory = convert(assembly_id)
    utils.check_dir(directory)
    fn = '%s/tmp.html' % directory
    utils.get_webpage(url, fn)
    page = utils.read_webpage(fn)

    # The page shows the total as u'총 N건' ("N bills in total").
    m = re.search(u'총(.+)건', page.xpath('//span[@class="text3"]/text()')[0])
    nbills = int(m.group(1))
    npages = int(math.ceil(nbills / float(PAGE_SIZE)))
    print 'Total %d bills, %d pages to %s' % (nbills, npages, directory)
    return npages
def parse_page(page, f, assembly_id): fn = "%s/%s/%d.html" % (DIR["list"], assembly_id, page) p = utils.read_webpage(fn) rows = utils.get_elems(p, X["table"]) for r in reversed(rows): columns = r.xpath(X["columns"]) if len(columns) == 8: p = parse_columns(columns) list_to_file(p, f) sys.stdout.write("%d\t" % page) sys.stdout.flush()
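# Usage sketch (an illustration, not part of the original source): size the
# crawl with get_npages, then parse every cached list page into one CSV.
#
#   npages = get_npages(19)
#   with open('%s/%d.csv' % (DIR['meta'], 19), 'w') as f:
#       for page in range(1, npages + 1):
#           parse_page(page, f, 19)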
def extract_specifics(assembly_id, bill_id, meta):

    def extract_file_links(c):
        # Group the child nodes between <br> tags, then map each trailing
        # text description to the hrefs collected before it.
        i, node = 0, []
        elem_node = c.xpath('descendant::node()')
        for j, n in enumerate(elem_node):
            if type(n) == lxml.etree._Element:
                if n.tag == 'br':
                    node.append(elem_node[i:j])
                    i = j
        links = dict()
        for n in node:
            tmp = []
            for m in n:
                if type(m) == lxml.etree._ElementUnicodeResult:
                    desc = m.strip()
                    links[desc] = tmp
                    tmp = []
                elif type(m) == lxml.etree._Element and m.tag not in ['img', 'br']:
                    tmp.append(m.xpath('@href')[0])
        return links

    def extract_meeting_num(c):
        # Matches headings such as u'제19대 ... 제320회'
        # (Nth assembly, Mth session).
        s = c.xpath('descendant::text()')[0]
        m = re.search(ur'제(.*)대.*제(.*)회', s)
        return [int(e) for e in m.groups()]

    def status_info(es, et, status_en):
        subjects = es.xpath('text()')[0]
        headers = [t[1] for t in utils.get_elem_texts(et, 'td')]
        elem_contents = [c for c in es.xpath(X['timeline']['%s_contents' % status_en])
                         if type(c) == lxml.etree._Element]
        elem_rows = [ec.xpath('td') for ec in elem_contents]
        rows = []
        for row in elem_rows:
            columns = []
            for column in row:
                links = column.xpath('descendant::a')
                images = column.xpath('descendant::img')
                if links:
                    columns.append([link.xpath('@href')[0] for link in links])
                elif images:
                    # The onclick handler carries '(base, docid, filename)'.
                    parts = re.sub(r'.*\((.*)\)', r'\g<1>',
                                   images[0].xpath('@onclick')[0])\
                        .replace(' ', '').replace('\'', '').split(',')
                    # Compare numerically; the original compared a string to
                    # an int, which is always True in Python 2.
                    if int(parts[1]) > 208:
                        url = '%sdata2/%s/pdf/%s' % (parts[0], parts[1], parts[2])
                    else:
                        url = '%sdata1/%s/%s' % (parts[0], parts[1], parts[2])
                    columns.append(url)
                else:
                    columns.append(column.xpath('descendant::text()')[1].strip())
            rows.append(dict(zip(headers, columns)))
        return rows

    def extract_extra_info(meta, c):
        # Korean literals are unicode so they compare equal to lxml's text.
        extra_infos = dict()
        current_category = None
        for node in c:  # was `for node in r`, a reference to an outer variable
            if node.tag == 'span' and node.get('class') == 'text11':
                current_category = node.text.strip()
                if current_category.startswith(u'대안반영폐기 의안목록'):
                    current_category = u'대안반영폐기 의안목록'
                continue
            if current_category is None:
                continue
            extra_infos.setdefault(current_category, [])
            if current_category == u'비고':
                content = extract_remark(node)
            elif current_category in (u'대안', u'대안반영폐기 의안목록'):
                content = extract_bill_id_from_link(meta, node)
            else:
                content = lxml.html.tostring(node)
            if content:
                extra_infos[current_category].append(content)
        return extra_infos

    def extract_remark(c):
        try:
            if c.tag == 'br':
                return c.tail.strip()
            else:
                return c.text.strip()
        except AttributeError:
            return None

    def extract_bill_id_from_link(meta, c):
        # Assume this is an <a> tag.
        href = c.get('href')
        match = re.match(r'/bill/jsp/BillDetail\.jsp\?bill_id=(.*)', href)
        if match:
            # Bind the group to a local name first; method calls are not
            # reliably supported inside pandas query strings.
            link_id = match.group(1)
            return meta.query('link_id == @link_id')['bill_id'].values[0]
        return None

    fn = '%s/%d/%s.html' % (DIR['specifics'], assembly_id, bill_id)
    page = utils.read_webpage(fn)
    table = utils.get_elems(page, X['spec_table'])[1]
    timeline = page.xpath(X['spec_timeline'])[0]

    title = page.xpath(X['spec_title'])[0].strip().replace('"', '')
    status_detail = ' '.join(page.xpath(X['spec_status'])).strip()
    statuses = filter(None,
                      (s.strip()
                       for s in ' '.join(
                           s for s in timeline.xpath(X['spec_timeline_statuses'])
                           if not type(s) == lxml.etree._Element
                       ).split('\n')))
    status_infos = [filter(None, i.split('*'))
                    for i in timeline.xpath(X['spec_timeline_status_infos'])]

    # Rows of the spec table alternate: titles at i % 4 == 0, contents at
    # i % 4 == 1.
    row_titles = [' '.join(e.xpath('td/text()')).strip()
                  for i, e in enumerate(table.xpath('tbody/tr')) if i % 4 == 0]
    elem_row_contents = [e.xpath('td[@class="text6"]')[0]
                         for i, e in enumerate(table.xpath('tbody/tr')) if i % 4 == 1]

    status_dict = {}
    for i, r in enumerate(elem_row_contents):
        if row_titles[i] != u'부가정보':
            # extract_row_contents is assumed to be defined elsewhere in
            # this module.
            status_dict[row_titles[i]] = extract_row_contents(r)
        else:
            status_dict[row_titles[i]] = extract_extra_info(meta, r)

    headers = ['assembly_id', 'bill_id', 'title', 'status_detail', 'statuses',
               'status_infos', 'status_dict']
    specifics = [assembly_id, bill_id, title, status_detail, statuses,
                 status_infos, status_dict]
    return dict(zip(headers, specifics))
def extract_withdrawers(assembly_id, bill_id):
    fn = '%s/%s/%s.html' % (DIR['withdrawers'], assembly_id, bill_id)
    page = utils.read_webpage(fn)
    return utils.get_elems(page, X['withdrawers'])
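# Usage sketch (an illustration, not part of the original source): assemble
# one bill's record from the extractors above. `meta` is assumed to be the
# pandas DataFrame of the list CSV that extract_bill_id_from_link queries,
# and the bill id is hypothetical.
#
#   summaries = extract_summaries(19, '1901234')
#   proposers = extract_proposers(19, '1901234')
#   withdrawers = extract_withdrawers(19, '1901234')
#   specifics = extract_specifics(19, '1901234', meta)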