Example #1
def fetch_new_bill_ids(assembly_id):
    directory = DIR['meta']
    meta_data = '%s/%d.csv' % (directory, assembly_id)

    lines = list(open(meta_data, 'r'))[1:]
    lines = [line.decode('utf-8') for line in lines]
    existing_ids = set(line.split(',', 1)[0].strip('"') for line in lines)
    last_proposed_date = max(
        line.split('","', 6)[5].strip('"') for line in lines)
    baseurl = BASEURL['list']
    url = '%(baseurl)sPROPOSE_FROM=%(last_proposed_date)s&PAGE_SIZE=100' % locals()

    directory = '%s/%s' % (DIR['list'], assembly_id)
    fn = '%s/tmp.html' % directory

    utils.get_webpage(url, fn)
    p = utils.read_webpage(fn)
    rows = utils.get_elems(p, X['table'])

    new_bill_ids = []
    with open(meta_data, 'a') as f:
        for r in reversed(rows):
            columns = r.xpath(X['columns'])
            if len(columns) == 8:
                p = parse_columns(columns)
                if p[0] not in existing_ids:
                    list_to_file(p, f)
                    new_bill_ids.append(p[0])
    return new_bill_ids
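The comprehensions above pull the bill id and the proposal date out of the quoted CSV metadata. A minimal, self-contained sketch of that parsing, using a made-up metadata line (the real column layout lives in the scraper's own CSV files):

line = '"PRC_X1Y2Z3","1809876","2","Some bill","Jane Doe","2012-05-30","..."\n'  # hypothetical line
bill_id = line.split(',', 1)[0].strip('"')            # first quoted field -> 'PRC_X1Y2Z3'
proposed_date = line.split('","', 6)[5].strip('"')    # sixth quoted field -> '2012-05-30'
print(bill_id, proposed_date)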
Example #2
def fetch_new_bill_ids(assembly_id):
    directory = DIR['meta']
    meta_data = '%s/%d.csv' % (directory, assembly_id)

    lines = list(open(meta_data, 'r'))[1:]
    lines = [line.decode('utf-8') for line in lines]
    existing_ids = set(line.split(',', 1)[0].strip('"') for line in lines)
    last_proposed_date = max(line.split('","', 6)[5].strip('"') for line in lines)
    baseurl = BASEURL['list']
    page_size = PAGE_SIZE
    url = '%(baseurl)sPROPOSE_FROM=%(last_proposed_date)s&PAGE_SIZE=%(page_size)d' % locals()

    directory = '%s/%s' % (DIR['list'], assembly_id)
    fn = '%s/tmp.html' % directory

    utils.get_webpage(url, fn)
    p = utils.read_webpage(fn)
    rows = utils.get_elems(p, X['table'])

    new_bill_ids = []
    with open(meta_data, 'a') as f:
        for r in reversed(rows):
            columns = r.xpath(X['columns'])
            if len(columns)==8:
                p = parse_columns(columns)
                if p[0] not in existing_ids:
                    list_to_file(p, f)
                    new_bill_ids.append(p[0])
    return new_bill_ids
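The only change from the previous revision is that the page size now comes from a PAGE_SIZE constant and is interpolated through % locals(). A tiny sketch of that idiom with made-up values:

baseurl = 'http://example.com/bills?'   # hypothetical
last_proposed_date = '2012-05-30'
page_size = 100
url = '%(baseurl)sPROPOSE_FROM=%(last_proposed_date)s&PAGE_SIZE=%(page_size)d' % locals()
print(url)   # http://example.com/bills?PROPOSE_FROM=2012-05-30&PAGE_SIZE=100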
Example #3
def extract_summaries(assembly_id, bill_id):
    # TODO: split the summary into 제안이유 (reasons for proposal) and 주요내용 (main content)
    try:
        fn = '%s/%s/%s.html' % (DIR['summaries'], assembly_id, bill_id)
        page = utils.read_webpage(fn)
        summaries = [e.replace('?', '/').strip()
                     for e in utils.get_elems(page, X['summary'])]
        return summaries
    except IOError:
        # No cached summary page for this bill.
        return None
Example #4
def extract_proposers(assembly_id, bill_id):
    # TODO: check whether member names sometimes appear in the 찬성의원 (consenting members) list
    fn = '%s/%s/%s.html' % (DIR['proposers'], assembly_id, bill_id)
    page = utils.read_webpage(fn)
    elems = utils.get_elems(page, X['proposers'])
    if assembly_id < 19:
        return elems
    else:
        key = ['name_kr', 'party', 'name_cn']
        values = [filter(None, re.split('[\(/\)]', e)) for e in elems]
        return [{k: v for k, v in zip(key, value)} for value in values]
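From the 19th assembly on, each proposer cell is assumed to look like name(party/hanja); re.split plus zip turns it into a dict. A self-contained sketch with a made-up proposer string:

import re

e = u'홍길동(모모당/洪吉童)'   # hypothetical proposer cell
key = ['name_kr', 'party', 'name_cn']
value = filter(None, re.split(u'[\(/\)]', e))
print(dict(zip(key, value)))   # {'name_kr': u'홍길동', 'party': u'모모당', 'name_cn': u'洪吉童'}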
Example #5
    def parse_page(page, f, assembly_id):
        fn = "%s/%s/%d.html" % (DIR["list"], assembly_id, page)
        p = utils.read_webpage(fn)
        rows = utils.get_elems(p, X["table"])

        for r in reversed(rows):
            columns = r.xpath(X["columns"])
            if len(columns) == 8:
                p = parse_columns(columns)
                list_to_file(p, f)

        sys.stdout.write("%d\t" % page)
        sys.stdout.flush()
Example #6
def parse_page(page, f, assembly_id):
    fn = '%s/%s/%d.html' % (DIR['list'], assembly_id, page)
    p = utils.read_webpage(fn)
    rows = utils.get_elems(p, X['table'])

    for r in reversed(rows):
        columns = r.xpath(X['columns'])
        if len(columns)==8:
            p = parse_columns(columns)
            list_to_file(p, f)

    sys.stdout.write('%d\t' % page)
    sys.stdout.flush()
Example #7
def extract_withdrawers(assembly_id, bill_id):
    fn = '%s/%s/%s.html' % (DIR['withdrawers'], assembly_id, bill_id)
    page = utils.read_webpage(fn)
    return utils.get_elems(page, X['withdrawers'])
Example #8
def extract_specifics(assembly_id, bill_id, meta):

    def extract_file_links(c):
        url = c.xpath('descendant::a/@href')
        i, node = 0, []
        elem_node = c.xpath('descendant::node()')
        for j, n in enumerate(elem_node):
            if type(n)==lxml.etree._Element:
                if n.tag=='br':
                    node.append(elem_node[i:j])
                    i = j
        links = dict()
        for n in node:
            tmp = []
            for m in n:
                if type(m)==lxml.etree._ElementUnicodeResult:
                    desc = m.strip()
                    links[desc] = tmp
                    tmp = []

                elif type(m)==lxml.etree._Element and m.tag not in ['img', 'br']:
                    tmp.append(m.xpath('@href')[0])
                else:
                    pass
        return links

    def extract_meeting_num(c):
        s = c.xpath('descendant::text()')[0]
        m = re.search(ur'제(.*)대.*제(.*)회', s)
        return [int(e) for e in m.groups()]

    def status_info(es, et, status_en):
        subjects = es.xpath('text()')[0]
        headers = [t[1] for t in utils.get_elem_texts(et, 'td')]

        elem_contents = [c for c in es.xpath(X['timeline']['%s_contents' % status_en]) if type(c)==lxml.etree._Element]
        elem_rows = [ec.xpath('td') for ec in elem_contents]

        rows = []
        for row in elem_rows:
            columns = []
            for column in row:
                links = column.xpath('descendant::a')
                images = column.xpath('descendant::img')
                if links:
                    columns.append([link.xpath('@href')[0] for link in links])
                elif images:
                    parts = re.sub(r'.*\((.*)\)', r'\g<1>',\
                            images[0].xpath('@onclick')[0])\
                            .replace(' ', '').replace('\'','')\
                            .split(',')
                    if int(parts[1]) > 208:  # the split yields strings; compare numerically
                        url = '%sdata2/%s/pdf/%s' % (parts[0], parts[1], parts[2])
                    else:
                        url = '%sdata1/%s/%s' % (parts[0], parts[1], parts[2])
                    columns.append(url)
                else:
                    columns.append(column.xpath('descendant::text()')[1].strip())
            rows.append(dict(zip(headers, columns)))
        return rows

    fn          = '%s/%d/%s.html' % (DIR['specifics'], assembly_id, bill_id)
    page        = utils.read_webpage(fn)
    table       = utils.get_elems(page, X['spec_table'])[1]
    timeline    = page.xpath(X['spec_timeline'])[0]

    title         = page.xpath(X['spec_title'])[0].strip().replace('"','')
    status_detail = ' '.join(page.xpath(X['spec_status'])).strip()
    statuses = filter(None, (s.strip() for s in ' '.join(
        s for s in timeline.xpath(X['spec_timeline_statuses'])
        if type(s) != lxml.etree._Element).split('\n')))
    status_infos = [filter(None, i.split('*'))
                    for i in timeline.xpath(X['spec_timeline_status_infos'])]
    row_titles = [' '.join(e.xpath('td/text()')).strip()
                  for i, e in enumerate(table.xpath('tbody/tr')) if i % 4 == 0]
    elem_row_contents = [e.xpath('td[@class="text6"]')[0]
                         for i, e in enumerate(table.xpath('tbody/tr')) if i % 4 == 1]
    status_dict = {}

    for i, r in enumerate(elem_row_contents):
        if row_titles[i] != '부가정보':
            status_dict[row_titles[i]] = extract_row_contents(r)
        else:
            # '부가정보' is the additional-info row: pair its labels with their texts.
            labels = r.xpath('span[@class="text8"]/text()')
            contents = filter(None, (s.strip() for s in r.xpath('text()')))
            status_dict[row_titles[i]] = dict(zip(labels, contents))

    headers = ['assembly_id', 'bill_id', 'title', 'status_detail', 'statuses', 'status_infos', 'status_dict']
    specifics = [assembly_id, bill_id, title, status_detail, statuses, status_infos, status_dict]

    return dict(zip(headers, specifics))
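status_info rebuilds a file URL from the onclick attribute of an <img> element. A self-contained sketch of that string surgery with a hypothetical onclick value; the comparison is done numerically, since the split yields strings:

import re

onclick = "openFile('http://example.com/filegate/', '209', 'sample.pdf')"   # hypothetical
parts = re.sub(r'.*\((.*)\)', r'\g<1>', onclick).replace(' ', '').replace("'", '').split(',')
if int(parts[1]) > 208:
    url = '%sdata2/%s/pdf/%s' % (parts[0], parts[1], parts[2])
else:
    url = '%sdata1/%s/%s' % (parts[0], parts[1], parts[2])
print(url)   # http://example.com/filegate/data2/209/pdf/sample.pdf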
Example #9
def extract_specifics(assembly_id, bill_id, meta):
    def extract_file_links(c):
        url = c.xpath('descendant::a/@href')
        i, node = 0, []
        elem_node = c.xpath('descendant::node()')
        for j, n in enumerate(elem_node):
            if type(n) == lxml.etree._Element:
                if n.tag == 'br':
                    node.append(elem_node[i:j])
                    i = j
        links = dict()
        for n in node:
            tmp = []
            for m in n:
                if type(m) == lxml.etree._ElementUnicodeResult:
                    desc = m.strip()
                    links[desc] = tmp
                    tmp = []

                elif type(m) == lxml.etree._Element and m.tag not in [
                        'img', 'br'
                ]:
                    tmp.append(m.xpath('@href')[0])
                else:
                    pass
        return links

    def extract_meeting_num(c):
        s = c.xpath('descendant::text()')[0]
        m = re.search(ur'제(.*)대.*제(.*)회', s)
        return [int(e) for e in m.groups()]

    def status_info(es, et, status_en):
        subjects = es.xpath('text()')[0]
        headers = [t[1] for t in utils.get_elem_texts(et, 'td')]

        elem_contents = [
            c for c in es.xpath(X['timeline']['%s_contents' % status_en])
            if type(c) == lxml.etree._Element
        ]
        elem_rows = [ec.xpath('td') for ec in elem_contents]

        rows = []
        for row in elem_rows:
            columns = []
            for column in row:
                links = column.xpath('descendant::a')
                images = column.xpath('descendant::img')
                if links:
                    columns.append([link.xpath('@href')[0] for link in links])
                elif images:
                    parts = re.sub(r'.*\((.*)\)', r'\g<1>',\
                            images[0].xpath('@onclick')[0])\
                            .replace(' ', '').replace('\'','')\
                            .split(',')
                    if int(parts[1]) > 208:  # the split yields strings; compare numerically
                        url = '%sdata2/%s/pdf/%s' % (parts[0], parts[1],
                                                     parts[2])
                    else:
                        url = '%sdata1/%s/%s' % (parts[0], parts[1], parts[2])
                    columns.append(url)
                else:
                    columns.append(
                        column.xpath('descendant::text()')[1].strip())
            rows.append(dict(zip(headers, columns)))
        return rows

    def extract_extra_info(meta, c):
        extra_infos = dict()
        current_category = None
        for node in c:
            # Category headers ('비고', '대안', ...) are <span class="text11"> elements.
            if node.tag == 'span' and node.get('class') == 'text11':
                current_category = node.text.strip()
                if current_category.startswith('대안반영폐기 의안목록'):
                    current_category = '대안반영폐기 의안목록'
                continue

            if current_category is None:
                continue

            extra_infos.setdefault(current_category, [])
            content = None
            if current_category == '비고':  # remarks
                content = extract_remark(node)
            elif current_category == '대안':  # alternative bill
                content = extract_bill_id_from_link(meta, node)
            elif current_category == '대안반영폐기 의안목록':  # bills superseded by the alternative
                content = extract_bill_id_from_link(meta, node)
            else:
                content = lxml.html.tostring(node)

            if content:
                extra_infos[current_category].append(content)
        return extra_infos

    def extract_remark(c):
        try:
            if c.tag == 'br':
                return c.tail.strip()
            else:
                return c.text.strip()
        except AttributeError:
            return None

    def extract_bill_id_from_link(meta, c):
        # Assume this is an <a> tag pointing at the bill detail page.
        href = c.get('href')
        match = re.match(r'/bill/jsp/BillDetail\.jsp\?bill_id=(.*)', href)
        if match:
            return meta.query(
                'link_id == @match.group(1)')['bill_id'].values[0]
        return None

    fn = '%s/%d/%s.html' % (DIR['specifics'], assembly_id, bill_id)
    page = utils.read_webpage(fn)
    table = utils.get_elems(page, X['spec_table'])[1]
    timeline = page.xpath(X['spec_timeline'])[0]

    title = page.xpath(X['spec_title'])[0].strip().replace('"', '')
    status_detail = ' '.join(page.xpath(X['spec_status'])).strip()
    statuses = filter(None, (s.strip() for s in ' '.join(
        s for s in timeline.xpath(X['spec_timeline_statuses'])
        if type(s) != lxml.etree._Element).split('\n')))
    status_infos = [filter(None, i.split('*'))
                    for i in timeline.xpath(X['spec_timeline_status_infos'])]
    row_titles = [' '.join(e.xpath('td/text()')).strip()
                  for i, e in enumerate(table.xpath('tbody/tr')) if i % 4 == 0]
    elem_row_contents = [e.xpath('td[@class="text6"]')[0]
                         for i, e in enumerate(table.xpath('tbody/tr')) if i % 4 == 1]
    status_dict = {}

    for i, r in enumerate(elem_row_contents):
        if row_titles[i] != '부가정보':
            status_dict[row_titles[i]] = extract_row_contents(r)
        else:
            status_dict[row_titles[i]] = extract_extra_info(meta, r)

    headers = [
        'assembly_id', 'bill_id', 'title', 'status_detail', 'statuses',
        'status_infos', 'status_dict'
    ]
    specifics = [
        assembly_id, bill_id, title, status_detail, statuses, status_infos,
        status_dict
    ]

    return dict(zip(headers, specifics))
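extract_bill_id_from_link maps a detail-page link back to a bill_id through the meta DataFrame. A hedged, self-contained sketch of the same lookup with made-up rows, using a plain boolean mask instead of DataFrame.query:

import re
import pandas as pd

meta = pd.DataFrame({'link_id': ['ARC_B1X2Y3'], 'bill_id': ['1809876']})   # hypothetical rows
href = '/bill/jsp/BillDetail.jsp?bill_id=ARC_B1X2Y3'
match = re.match(r'/bill/jsp/BillDetail\.jsp\?bill_id=(.*)', href)
if match:
    print(meta[meta['link_id'] == match.group(1)]['bill_id'].values[0])   # 1809876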
Example #10
def extract_proposers(assembly_id, bill_id):
    # TODO: check whether member names sometimes appear in the 찬성의원 (consenting members) list
    fn = '%s/%s/%s.html' % (DIR['proposers'], assembly_id, bill_id)
    page = utils.read_webpage(fn)
    return utils.get_elems(page, X['proposers'])