示例#1
0
文件: batch.py 项目: JH27/crawlers
    def get_sitting_urls(assembly_id, div_id, sessionurl):
        """Return one search URL per sitting found on a session page.

        The page at ``sessionurl`` is loaded through the ``get`` helpers, every
        anchor ``href`` is run through ``parse_js_call``, and the calls are
        reduced to a parameter dict by ``match_name_codes`` (filtered on
        ``'mainsearch2'``).  The number of entries in that dict is taken as
        the number of sittings.

        ``assembly_id`` and ``div_id`` are accepted but not used in this body.

        Returns a list of ``{'session_name': ..., 'url': ...}`` dicts, in
        index order.
        """
        page = get.webpage(get.htmltree(sessionurl))
        calls = [parse_js_call(href) for href in page.xpath('.//a/@href')]

        query = match_name_codes(calls, filter='mainsearch2', type='sessions')
        count = len(query)
        # The count is taken before 'j' is added, so 'j' is not counted.
        query['j'] = str(count)

        def sitting_entry(index):
            # Copy the indexed value onto the plain 'SES_NUM' key; the shared
            # dict is mutated in place, so each encoded query carries the
            # value for the sitting currently being processed.
            name = query['SES_NUM%s' % index]
            query['SES_NUM'] = name
            target = '%s&%s' % (sessionurl, urlencode(query))
            # TODO: generalize me
            return {'session_name': name,
                    'url': target.replace('con_search2', 'con_search3')}

        return [sitting_entry(index) for index in range(count)]
示例#2
0
    def get_sitting_urls(assembly_id, div_id, sessionurl):
        """Collect the per-sitting search URLs reachable from ``sessionurl``.

        Anchors on the fetched page are parsed with ``parse_js_call`` and
        turned into a parameter dict by ``match_name_codes`` (filter
        ``'mainsearch2'``, type ``'sessions'``).  For each index below the
        dict's initial size, the matching ``SES_NUM<index>`` value is copied
        to ``SES_NUM`` and the whole dict is URL-encoded onto ``sessionurl``.

        ``assembly_id`` and ``div_id`` are not referenced in this body.

        Returns a list of dicts with the keys ``'session_name'`` and ``'url'``.
        """
        tree = get.webpage(get.htmltree(sessionurl))
        hrefs = tree.xpath('.//a/@href')
        parsed_calls = [parse_js_call(href) for href in hrefs]

        fields = match_name_codes(parsed_calls,
                                  filter='mainsearch2',
                                  type='sessions')
        # Size is measured before the extra 'j' entry is stored.
        total = len(fields)
        fields['j'] = str(total)

        sittings = []
        for idx in range(total):
            current = fields['SES_NUM%s' % idx]
            # The dict is reused across iterations; only 'SES_NUM' changes.
            fields['SES_NUM'] = current
            query_string = urlencode(fields)
            link = '%s&%s' % (sessionurl, query_string)
            # TODO: generalize me
            link = link.replace('con_search2', 'con_search3')
            sittings.append({'session_name': current, 'url': link})
        return sittings
示例#3
0
文件: batch.py 项目: JH27/crawlers
def get_session_urls(assembly_id, div_id, listurl):
    """Return one search URL per committee found on a listing page.

    The page at ``listurl`` is loaded through the ``get`` helpers, every
    anchor ``href`` is run through ``parse_js_call``, and the calls are
    reduced to a parameter dict by ``match_name_codes`` (filter
    ``'mainsearch'``, type ``'committees'``).

    Args:
        assembly_id: stored (as a string) under the ``DAE_NUM`` parameter.
        div_id: stored (as a string) under the ``div`` parameter.
        listurl: address of the listing page to scan.

    Returns:
        A list of ``{'committee': ..., 'url': ...}`` dicts, in index order.

    Raises:
        IndexError: if the page has no ``searchform`` form while at least
            one committee entry is being built.
        KeyError: if an expected ``COMM_NAME<i>``/``COMM_CODE<i>`` key is
            missing from the matched parameters.
    """
    def searchform(root, num=''):
        # First @action of the form named "searchform<num>".
        return root.xpath('.//form[@name="searchform%s"]/@action' % num)[0]

    root = get.webpage(get.htmltree(listurl))
    js_calls = [parse_js_call(j) for j in root.xpath('.//a/@href')]

    params = match_name_codes(js_calls, filter='mainsearch', type='committees')
    # Each committee contributes two entries (COMM_NAME<i> and COMM_CODE<i>).
    # Floor division keeps the count an int on Python 3, where `/` would
    # yield a float and break both range() and the 'i' parameter ("3.0").
    nsessions = len(params) // 2
    params['i'] = str(nsessions)
    params['div'] = str(div_id)
    params['DAE_NUM'] = str(assembly_id)

    urls = []
    for i in range(nsessions):
        # The shared dict is mutated in place so the encoded query carries
        # the committee currently being processed.
        params['COMM_NAME'] = params['COMM_NAME%s' % i]
        params['COMM_CODE'] = params['COMM_CODE%s' % i]
        # The last two characters of the form action are dropped before use
        # (NOTE(review): reason not visible here -- confirm against the page).
        urls.append(
            {'committee': params['COMM_NAME'],
             'url': '%s/content/%s?%s' %
                    (BASEURL, searchform(root)[:-2], urlencode(params))})
    return urls
示例#4
0
def get_session_urls(assembly_id, div_id, listurl):
    """Build the per-committee search URLs reachable from ``listurl``.

    Anchors on the fetched page are parsed with ``parse_js_call`` and turned
    into a parameter dict by ``match_name_codes`` (filter ``'mainsearch'``,
    type ``'committees'``).  For every committee index, the indexed name and
    code are copied onto the plain ``COMM_NAME``/``COMM_CODE`` keys and the
    whole dict is URL-encoded into a link under ``BASEURL``.

    Args:
        assembly_id: stored (as a string) under the ``DAE_NUM`` parameter.
        div_id: stored (as a string) under the ``div`` parameter.
        listurl: address of the listing page to scan.

    Returns:
        A list of ``{'committee': ..., 'url': ...}`` dicts, in index order.

    Raises:
        IndexError: if the page has no ``searchform`` form while at least
            one committee entry is being built.
        KeyError: if an expected ``COMM_NAME<i>``/``COMM_CODE<i>`` key is
            missing from the matched parameters.
    """
    def searchform(root, num=''):
        # First @action of the form named "searchform<num>".
        return root.xpath('.//form[@name="searchform%s"]/@action' % num)[0]

    root = get.webpage(get.htmltree(listurl))
    js_calls = [parse_js_call(j) for j in root.xpath('.//a/@href')]

    params = match_name_codes(js_calls, filter='mainsearch', type='committees')
    # Two entries per committee (name + code).  Use floor division so the
    # count stays an int on Python 3; true division would give a float that
    # range() rejects and that str() renders as e.g. "3.0".
    nsessions = len(params) // 2
    params['i'] = str(nsessions)
    params['div'] = str(div_id)
    params['DAE_NUM'] = str(assembly_id)

    urls = []
    for i in range(nsessions):
        # Reuse the same dict; only the two un-indexed keys change per pass.
        params['COMM_NAME'] = params['COMM_NAME%s' % i]
        params['COMM_CODE'] = params['COMM_CODE%s' % i]
        # The form action has its last two characters removed before use
        # (NOTE(review): reason not visible here -- confirm against the page).
        urls.append(
            {'committee': params['COMM_NAME'],
             'url': '%s/content/%s?%s' %
                    (BASEURL, searchform(root)[:-2], urlencode(params))})
    return urls
示例#5
0
文件: batch.py 项目: JH27/crawlers
 def get_doc_ids(assembly_id, div_id, sittingurl):
     """Return the document ids referenced from a sitting page.

     Every anchor ``href`` on the page fetched from ``sittingurl`` is parsed
     with ``parse_js_call``; calls whose first element is ``'mainsearch4'``
     are kept.  The first two items of each kept call's second element
     become ``'sitting_name'`` and ``'docid'`` respectively.

     ``assembly_id`` and ``div_id`` are not referenced in this body.
     """
     page = get.webpage(get.htmltree(sittingurl))
     # Parse every link up front, then filter, as two separate passes.
     parsed = [parse_js_call(href) for href in page.xpath('.//a/@href')]

     docs = []
     for call in parsed:
         if call[0] != 'mainsearch4':
             continue
         args = call[1]
         docs.append({'sitting_name': args[0], 'docid': args[1]})
     return docs
示例#6
0
 def get_doc_ids(assembly_id, div_id, sittingurl):
     """List sitting names and document ids linked from ``sittingurl``.

     The page is loaded through the ``get`` helpers and each anchor ``href``
     is converted with ``parse_js_call``.  Only calls whose first element
     equals ``'mainsearch4'`` contribute a result; for those, items 0 and 1
     of the call's second element are returned as ``'sitting_name'`` and
     ``'docid'``.

     ``assembly_id`` and ``div_id`` are accepted but unused here.
     """
     tree = get.webpage(get.htmltree(sittingurl))
     hrefs = tree.xpath('.//a/@href')
     all_calls = [parse_js_call(link) for link in hrefs]

     def is_doc_call(call):
         return call[0] == 'mainsearch4'

     def to_record(call):
         arguments = call[1]
         return {'sitting_name': arguments[0], 'docid': arguments[1]}

     return [to_record(call) for call in all_calls if is_doc_call(call)]