def detail_parse(url):
    """Parse a Chongqing bidding detail page and persist the record.

    url: absolute detail-page URL; None is ignored.
    Side effects: HTTP GET via request.get, write via Save.push.
    """
    if url is None:
        return
    response = request.get(url)
    if response is None:
        return
    # Raw string avoids invalid-escape warnings for \$, \(, \. in the pattern.
    # The title is injected client-side via $("#tith1").html("...").
    titles = re.findall(r'\$\("#tith1"\)\.html\("(.*?)"\);', response)
    title = None
    if len(titles) == 2:
        title = str(titles[1]).replace(' ', '').replace('\r', '').replace('\n', '')
    if title is None:
        return
    item = {}
    item['status'] = '招标公告' if '招标' in title else '中标公告'
    item['title'] = title
    item['area_name'] = '重庆'
    item['source'] = 'http://www.cqzb.gov.cn'
    date = re.findall(r'发布时间: (\d\d\d\d-\d\d-\d\d)', response)
    item['publish_date'] = date[0] if len(date) > 0 else None
    item['detail_url'] = url
    item['create_time'] = datetime.now()
    item['zh_name'] = '重庆市招标投标综合网'
    item['en_name'] = 'Chongqing Bidding Comprehensive Network'
    selector = etree.HTML(response)
    div_html = selector.xpath('//div[@class="ztb_con_exp"]')
    item['content_html'] = html.tostring(
        div_html[0], encoding='utf-8', pretty_print=True,
        method="html").decode('utf-8')
    Save.push(item=item, key=url)
def detail_parse(id):
    """Fetch one Guizhou tender record from the JSON API and persist it.

    id: record id (numeric string from the list API); None is ignored.
    Side effects: HTTP GET via request.get, write via Save.push.
    """
    if id is None:
        return
    url = 'http://www.gzzbw.cn/api/trade/{}'.format(id)
    response = request.get(url, header={
        'Accept': '*/*',
    })
    if response is None:
        return
    res = json.loads(response)
    # BUG fix: the membership test was done on the raw JSON *string*
    # ('Title' in response) instead of the parsed dict.
    title = res['Title'] if 'Title' in res else None
    if title is None:
        return
    item = {}
    item['title'] = str(title).replace('\r', '').replace('\n', '')
    item['area_name'] = '贵州'
    item['source'] = 'http://www.gzzbw.cn'
    item['publish_date'] = res['PublishDate']
    item['detail_url'] = url
    item['create_time'] = datetime.now()
    item['content_html'] = res['Content']
    item['zh_name'] = '贵州省招标投标公共服务平台'
    item['en_name'] = 'Guizhou Tender Public Service Platform'
    status = ''
    for timezone in res['Timezones']:
        # Compare the record id directly instead of substring-matching
        # the dict repr ("'Id': {}" in str(timezone)), which was fragile.
        if str(timezone.get('Id')) == str(id):
            status = timezone['BTypeName']
    item['status'] = status
    Save.push(item=item, key=id)
def detail_parse(url):
    """Parse a Guangxi procurement detail page (gb2312) and persist it.

    url: absolute detail-page URL; None is ignored.
    """
    if url is None:
        return
    response = request.get(url, type='gb2312')
    if response is None:
        return
    Html = etree.HTML(response)
    titles = Html.xpath('//div[@class="ctitle ctitle1"]/text()')
    item = {}
    item['title'] = titles[0] if len(titles) != 0 else None
    if item['title'] is None:
        return
    item['area_name'] = '广西壮族自治区'
    item['source'] = 'http://www.gxgp.gov.cn'
    # Raw string avoids invalid-escape warnings for \d.
    date = re.findall(r'发布日期:(\d\d\d\d-\d\d-\d\d)', response)
    item['publish_date'] = date[0] if len(date) != 0 else '未公布'
    item['detail_url'] = url
    item['create_time'] = datetime.now()
    item['zh_name'] = '广西壮族自治区政府采购中心'
    item['en_name'] = 'Guangxi Government Procurement Center'
    div_html = Html.xpath('//div[@class="page_row"]')
    item['content_html'] = html.tostring(
        div_html[0], encoding='utf-8', pretty_print=True,
        method="html").decode('utf-8')
    # Status is keyed off the URL path segment. (Removed an unused
    # second etree.HTML parse of content_html.)
    if 'ygswz' in url:
        item['status'] = '预公告'
    elif ('cgxjcg' in url or 'cggkzb' in url or 'cgjz' in url
          or 'cgjzxcs' in url or 'cgdyly' in url):
        item['status'] = '采购公告'
    else:
        item['status'] = '中标公告'
    Save.push(item=item, key=url)
def detail_parse(url):
    """Parse a Hebei public-resource detail page and persist it.

    url: absolute detail-page URL; None is ignored.
    """
    if url is None:
        return
    response = request.get(url)
    if response is None:
        return
    Html = etree.HTML(response)
    titles = Html.xpath('//h1[@class="show-title"]/text()')
    if len(titles) == 0:
        return
    item = {}
    item['title'] = str(titles[0]).strip()
    item['area_name'] = '河北'
    item['source'] = 'http://www.hebpr.cn'
    # The page exposes no reliable publish date; stored empty by design
    # (the old date regex was already commented out).
    item['publish_date'] = ''
    item['detail_url'] = url
    item['create_time'] = datetime.now()
    item['zh_name'] = '河北省政务服务管理办公室'
    item['en_name'] = 'Hebei Province Public resource'
    div_html = Html.xpath('//div[@class="show-con infoContent"]')
    item['content_html'] = html.tostring(
        div_html[0], encoding='utf-8', pretty_print=True,
        method="html").decode('utf-8')
    # Status is the last breadcrumb link. (Removed unused html1 re-parse.)
    states = Html.xpath('//div[@class="bread-route"]/a[last()]/text()')
    item['status'] = states[0] if len(states) != 0 else None
    Save.push(item=item, key=url)
def list_parse():
    """Drain queued ccgp list pages, packing each link as url**title**date."""
    while True:
        list_url = save.pop(name=dbname)
        if list_url is None:
            break
        response = request.get(list_url)
        if response is None:
            continue
        page = etree.HTML(response)
        for url in page.xpath('//tr[@height="22"]/td/a[@target="_blank"]/@href'):
            title = ''.join(page.xpath('//a[@href="{}"]/text()'.format(url)))
            date = ''.join(page.xpath(
                '//a[@href="{}"]/../following-sibling::td[last()]/text()'
                .format(url)))
            if title != '' and date != '':
                detail_parse('http://www.ccgp.com.cn/ccgp/list'
                             + str(url).replace('./', '/')
                             + '**' + title + '**' + date)
def detail_parse(url):
    """Parse a Shenzhen public-resource detail page and persist it.

    url: absolute detail-page URL; None is ignored.
    """
    if url is None:
        return
    response = request.get(url, type='utf-8')
    if response is None:
        return
    Html = etree.HTML(response)
    titles = Html.xpath('//div[@class="sub-xx-tit"]/text()')
    if len(titles) == 0:
        return
    item = {}
    item['title'] = str(titles[0]).strip()
    item['area_name'] = '深圳'
    item['source'] = 'http://ggzy.sz.gov.cn'
    # NOTE: str.strip removes a character *set*; works here because the
    # date text contains none of the prefix characters.
    item['publish_date'] = (''.join(
        Html.xpath('//div[@class="sub-xx-time"]/text()'))).strip('发表时间:')
    item['detail_url'] = url
    item['create_time'] = datetime.now()
    item['zh_name'] = '深圳市公共资源交易平台'
    item['en_name'] = 'Shenzhen Public resource'
    div_html = Html.xpath('//div[@class="sub-warp"]')
    item['content_html'] = html.tostring(
        div_html[0], encoding='utf-8', pretty_print=True,
        method="html").decode('utf-8')
    # Status is the last breadcrumb entry. (Removed unused html1 re-parse.)
    states = Html.xpath('//div[@class="weizhi"]/a/text()')
    item['status'] = states[-1] if len(states) != 0 else None
    Save.push(item=item, key=url)
def detail_parse(url):
    """Parse a Jilin government-procurement detail page and persist it.

    url: absolute detail-page URL; None is ignored.
    """
    if url is None:
        return
    response = request.get(url, type='utf-8')
    if response is None:
        return
    Html = etree.HTML(response)
    titles = Html.xpath('//div[@class="bx_title"]/text()')
    if len(titles) == 0 or str(titles[0]).strip() == '':
        return
    item = {}
    item['title'] = str(titles[0]).strip()
    item['area_name'] = '吉林'
    item['source'] = 'http://www.jlszfcg.gov.cn'
    # Raw strings avoid invalid-escape warnings for \d and \(.
    date = re.findall(r'发布日期:(\d\d\d\d-\d\d-\d\d)', response)
    item['publish_date'] = str(date[0]).strip() if len(date) > 0 else None
    item['detail_url'] = url
    item['create_time'] = datetime.now()
    div_html = Html.xpath('//div[@class="con08_b"]')
    item['content_html'] = html.tostring(
        div_html[0], encoding='utf-8', pretty_print=True,
        method="html").decode('utf-8')
    # Status is the last parenthesised tag in the title, e.g. "...(中标公告)".
    # (Removed unused html1 re-parse.)
    states = re.findall(r'\(.*?\)', item['title'])
    item['status'] = str(states[-1]).strip('(').strip(')') if len(states) != 0 else None
    item['zh_name'] = '吉林省政府采购中心'
    item['en_name'] = 'Jilin Government procurement center'
    Save.push(item=item, key=url)
def detail_parse(url):
    """Parse a Hubei public-resource (hbggzy) detail page and persist it.

    url: absolute detail-page URL; None is ignored.
    """
    if url is None:
        return
    response = request.get(url)
    if response is None:
        return
    Html = etree.HTML(response)
    titles = Html.xpath('//h6[@class="news-article-tt"]/text()')
    if len(titles) == 0:
        return
    item = {}
    item['title'] = str(titles[0]).strip()
    item['area_name'] = '湖北'
    item['source'] = 'http://www.hbggzy.cn'
    # Raw string avoids invalid-escape warnings for \d.
    date = re.findall(r'发稿时间 :(\d\d\d\d-\d\d-\d\d)', response)
    item['publish_date'] = str(date[0]).strip() if len(date) > 0 else None
    item['detail_url'] = url
    item['create_time'] = datetime.now()
    item['zh_name'] = '湖北省公共资源交易网'
    item['en_name'] = 'Hubei Province Government Procurement'
    div_html = Html.xpath('//div[@class="ewb-row"]')
    item['content_html'] = html.tostring(
        div_html[0], encoding='utf-8', pretty_print=True,
        method="html").decode('utf-8')
    # Status is the last breadcrumb span.
    states = Html.xpath('//div[@class="ewb-route"]/span[last()]/text()')
    item['status'] = states[-1] if len(states) != 0 else None
    Save.push(item=item, key=url)
def detail_parse(url):
    """Parse a Guangzhou detail record packed as "url***date***title".

    url: packed string produced by the matching list_parse; None is ignored.
    Raises IndexError if fewer than three '***'-separated parts are given
    (same as the original triple-split).
    """
    if url is None:
        return
    # Split once instead of three times.
    parts = url.split('***')
    url, publish, title = parts[0], parts[1], parts[2]
    response = request.get(url)
    if response is None:
        return
    Html = etree.HTML(response)
    item = {}
    item['title'] = title
    item['area_name'] = '广州'
    item['source'] = 'http://www.gzggzy.cn'
    item['publish_date'] = publish
    item['detail_url'] = url
    item['create_time'] = datetime.now()
    item['zh_name'] = '广州公共资源交易中心'
    item['en_name'] = 'Guangzhou City Government Procurement'
    div_html = Html.xpath('//div[@class="xx-text"]')
    item['content_html'] = html.tostring(
        div_html[0], encoding='utf-8', pretty_print=True,
        method="html").decode('utf-8')
    # Status is keyed off the channelId query parameter.
    # (Removed unused html1 re-parse.)
    if 'channelId=456' in url:
        item['status'] = '采购公告'
    elif 'channelId=448' in url:
        item['status'] = '预公告'
    elif 'channelId=457' in url:
        item['status'] = '更正公告'
    else:
        item['status'] = '结果公告'
    Save.push(item=item, key=url)
def detail_parse(url):
    """Parse a Hubei government-procurement detail page and persist it.

    url: absolute detail-page URL; None is ignored.
    """
    item = {}
    if url is None:
        return
    response = request.get(url, type='utf-8')
    if response is None:
        return
    Html = etree.HTML(response)
    titles = Html.xpath('//div[@class="title2"]/text()')
    item['title'] = str(titles[0]).strip() if len(titles) != 0 else None
    if item['title'] is None:
        return
    item['area_name'] = '湖北'
    item['source'] = 'http://www.hubeigp.gov.cn'
    date = Html.xpath('//td[contains(text(),"发布时间")]//text()')
    # NOTE: str.strip removes a character *set*, not the literal prefix;
    # works here because the date digits/dashes are not in that set.
    item['publish_date'] = str(
        date[0]).strip('发布时间:') if len(date) != 0 else '未公布'
    item['detail_url'] = url
    item['create_time'] = datetime.now()
    item['zh_name'] = '湖北省政府采购中心'
    item['en_name'] = 'Hubei Government Procurement Center'
    # Detail body. (Removed unused html1 re-parse of content_html.)
    div_html = Html.xpath('//td[@class="h2"]')
    item['content_html'] = html.tostring(
        div_html[0], encoding='utf-8', pretty_print=True,
        method="html").decode('utf-8')
    # Status: last breadcrumb link.
    states = Html.xpath('//td[contains(text(),"当前位置")]/span/a[last()]/text()')
    item['status'] = str(states[0]).strip() if len(states) != 0 else None
    Save.push(item=item, key=url)
def detail_parse(url):
    """Parse a Changsha public-resource detail page and persist it.

    The site uses several different title markups, so a list of fallback
    XPaths is tried in order (same order as the original cascade).
    """
    if url is None:
        return
    response = request.get(url)
    if response is None:
        return
    Html = etree.HTML(response)
    title_xpaths = (
        '//div[@class="detail_t"]/h1/text()',
        '//span[@id="title"]/text()',
        '//p[@style="TEXT-ALIGN: center"]/text()',
        '//p[@style="TEXT-ALIGN: right"]/text()',
        '//span[@style="FONT-SIZE: 20px"]/strong/text()',
    )
    titles = []
    for xpath in title_xpaths:
        titles = Html.xpath(xpath)
        if len(titles) != 0:
            break
    if len(titles) == 0 or str(titles[0]).strip() == '':
        return
    item = {}
    item['title'] = str(titles[0]).strip()
    item['area_name'] = '长沙'
    item['source'] = 'http://csggzy.changsha.gov.cn/'
    # Raw string avoids invalid-escape warnings for \d.
    date = re.findall(r'发布日期:(\d\d\d\d-\d\d-\d\d)', response)
    item['publish_date'] = str(date[0]).strip() if len(date) > 0 else None
    item['detail_url'] = url
    item['create_time'] = datetime.now()
    div_html = Html.xpath('//div[@class="detail"]')
    item['content_html'] = html.tostring(
        div_html[0], encoding='utf-8', pretty_print=True,
        method="html").decode('utf-8')
    # Status is the last breadcrumb link.
    states = Html.xpath('//div[@id="dh"]/a/text()')
    item['status'] = str(states[-1]).strip() if len(states) != 0 else None
    item['zh_name'] = '长沙公共资源交易中心'
    item['en_name'] = 'Changsha Public resource'
    Save.push(item=item, key=url)
def detail_parse(url):
    """Parse a Shijiazhuang public-resource detail page and persist it.

    url: absolute detail-page URL; None is ignored.
    """
    if url is None:
        return
    response = request.get(url)
    if response is None:
        return
    Html = etree.HTML(response)
    titles = Html.xpath('//div[@class="mt20 tc f24 fb lp1"]/text()')
    if len(titles) == 0:
        return
    item = {}
    item['title'] = str(titles[0]).strip()
    item['area_name'] = '石家庄'
    item['source'] = 'http://www.sjzsxzspj.gov.cn'
    # Raw string avoids invalid-escape warnings for \d.
    date = re.findall(r'(\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d)', response)
    item['publish_date'] = str(date[0]) if len(date) > 0 else None
    item['detail_url'] = url
    item['create_time'] = datetime.now()
    item['zh_name'] = '石家庄公共资源交易网'
    item['en_name'] = 'Shijiazhuang City Public resource'
    div_html = Html.xpath('//div[@class="neiyeibox fr"]')
    item['content_html'] = html.tostring(
        div_html[0], encoding='utf-8', pretty_print=True,
        method="html").decode('utf-8')
    # BUG fix: 'zfzbgga' also contains 'zfzbgg', so the original
    # elif 'zfzbgga' branch was unreachable; test the more specific
    # token first. (Removed unused html1 re-parse.)
    if 'zfzbgga' in url:
        item['status'] = '中标公告'
    elif 'zfzbgg' in url:
        item['status'] = '招标公告'
    elif 'gzgg' in url:
        item['status'] = '更正公告'
    else:
        item['status'] = '单一来源'
    Save.push(item=item, key=url)
def detail_parse(url):
    """Parse a Henan public-resource detail page and persist it.

    url: absolute detail-page URL; None is ignored.
    """
    if url is None:
        return
    response = request.get(url)
    if response is None:
        return
    Html = etree.HTML(response)
    titles = Html.xpath('//tr/td[@align="center"]/font/text()')
    if len(titles) == 0:
        return
    item = {}
    item['title'] = str(titles[0]).strip()
    item['area_name'] = '河南'
    item['source'] = 'http://www.hnggzy.com'
    # Site prints slash dates, e.g. 2019/1/2; the 4-5 digit year group
    # mirrors the original pattern. Raw string avoids escape warnings.
    date = re.findall(r'信息时间:(\d{4,5}/\d{1,2}/\d{1,2})', response)
    item['publish_date'] = str(date[0]).strip() if len(date) > 0 else None
    item['detail_url'] = url
    item['create_time'] = datetime.now()
    item['zh_name'] = '河南省公共资源交易中心'
    item['en_name'] = 'Henan Province Public resource'
    div_html = Html.xpath('//table[@align="center"]')
    item['content_html'] = html.tostring(
        div_html[0], encoding='utf-8', pretty_print=True,
        method="html").decode('utf-8')
    # Status is the last breadcrumb link. (Removed unused html1 re-parse.)
    states = Html.xpath('//font[contains(text(),"您现在的位置")]/..//a//text()')
    item['status'] = states[-1] if len(states) != 0 else None
    Save.push(item=item, key=url)
def detail_parse(url):
    """Parse a Taiyuan detail record packed as "url**title".

    url: packed string produced by the matching list_parse; None is ignored.
    """
    if url is None:
        return
    # Split once instead of twice.
    parts = url.split('**')
    title = parts[-1]
    url = parts[0]
    response = request.get(url)
    if response is None:
        return
    Html = etree.HTML(response)
    item = {}
    item['title'] = title
    item['area_name'] = '太原'
    item['source'] = 'http://www.tyzfcg.gov.cn/'
    # Raw string avoids invalid-escape warnings for \d.
    date = re.findall(r'发布时间:(\d\d\d\d-\d\d-\d\d)', response)
    item['publish_date'] = date[0] if len(date) != 0 else '未公布'
    item['detail_url'] = url
    item['create_time'] = datetime.now()
    item['zh_name'] = '太原市政府采购中心'
    item['en_name'] = 'Taiyuan Government Procurement Center'
    div_html = Html.xpath('//tr[@class="bk5"]')
    item['content_html'] = html.tostring(
        div_html[0], encoding='utf-8', pretty_print=True,
        method="html").decode('utf-8')
    # Status appears in breadcrumb text like "--中标公告--正文".
    # (Removed unused html1 re-parse.)
    states = re.findall(r'--(.*?)--正文', response)
    item['status'] = str(states[0]).strip() if len(states) != 0 else None
    Save.push(item=item, key=url)
def list_parse():
    """Drain queued Inner Mongolia list pages and hand each detail link off."""
    while True:
        list_url = save.pop(name=dbname)
        if list_url is None:
            break
        response = request.get(list_url)
        if response is None:
            continue
        page = etree.HTML(response)
        for href in page.xpath('//tr/td/li/a/@href'):
            detail_parse('http://www.nmgzfcg.gov.cn' + href)
def list_parse():
    """Drain queued Dalian trade list pages and hand each detail link off."""
    while True:
        list_url = save.pop(name=dbname)
        if list_url is None:
            break
        response = request.get(list_url)
        if response is None:
            continue
        page = etree.HTML(response)
        for href in page.xpath('//tr[@class="ewb-trade-tr"]/td/a/@href'):
            detail_parse('http://ggzyjy.dl.gov.cn' + href)
def list_parse():
    """Drain queued Hubei list pages and hand each detail link off.

    BUG fix: a failed fetch previously returned, silently dropping every
    URL still queued; now only the failed page is skipped (matching the
    other list_parse functions in this module).
    """
    while True:
        list_url = save.pop(name=dbname)
        if list_url is None:
            break
        response = request.get(list_url)
        if response is None:
            continue
        page = etree.HTML(response)
        for href in page.xpath('//td[@class="linebg"]/a/@href'):
            detail_parse('http://www.hubeigp.gov.cn' + href)
def list_parse():
    """Drain queued list pages and hand each (already absolute) link off."""
    while True:
        list_url = save.pop(name=dbname)
        if list_url is None:
            break
        response = request.get(list_url)
        if response is None:
            continue
        page = etree.HTML(response)
        for href in page.xpath('//div[@class="List2 Top8"]//li/a/@href'):
            detail_parse(href)
def list_parse():
    """Drain queued hbggzy list pages and hand each detail link off."""
    while True:
        list_url = save.pop(name=dbname)
        if list_url is None:
            break
        response = request.get(list_url)
        if response is None:
            continue
        page = etree.HTML(response)
        for href in page.xpath('//ul[@class="ewb-news-items"]/li/a/@href'):
            detail_parse('http://www.hbggzy.cn' + href)
def list_parse():
    """Drain queued Hebei list pages and hand each detail link off."""
    while True:
        list_url = save.pop(name=dbname)
        if list_url is None:
            break
        response = request.get(list_url)
        if response is None:
            continue
        page = etree.HTML(response)
        for href in page.xpath('//a[@class="frame-con-link clearfix"]/@href'):
            detail_parse('http://www.hebpr.cn' + href)
def list_parse():
    """Drain queued chinatax list pages and hand each detail link off.

    BUG fix: a failed fetch previously returned, silently dropping every
    URL still queued; now only the failed page is skipped (matching the
    other list_parse functions in this module).
    """
    while True:
        list_url = save.pop(name=dbname)
        if list_url is None:
            break
        response = request.get(list_url)
        if response is None:
            continue
        page = etree.HTML(response)
        for href in page.xpath('//dd/a[contains(@href,"content.html")]/@href'):
            detail_parse('http://www.chinatax.gov.cn/'
                         + str(href).replace('../', ''))
def list_parse():
    """Drain queued Chongqing list pages and hand each detail link off."""
    while True:
        list_url = save.pop(name=dbname)
        if list_url is None:
            break
        response = request.get(list_url)
        if response is None:
            continue
        page = etree.HTML(response)
        for href in page.xpath('//div[@class="ztb_list_right"]/ul/li/a/@href'):
            detail_parse('http://www.cqzb.gov.cn/' + href)
def list_parse():
    """Drain queued Guizhou API list pages; extract record ids and hand off.

    Fixes: raw string for the regex (avoids an invalid-escape warning for
    \\d) and a loop variable that no longer shadows the id() builtin.
    """
    while True:
        list_url = save.pop(name=dbname)
        if list_url is None:
            break
        response = request.get(list_url, header={
            'Accept': '*/*',
        })
        if response is None:
            continue
        for record_id in re.findall(r'"Id":(\d{1,10}),', response):
            detail_parse(record_id)
def list_parse():
    """Drain queued Henan list pages and hand each detail link off."""
    while True:
        list_url = save.pop(name=dbname)
        if list_url is None:
            break
        response = request.get(list_url)
        if response is None:
            continue
        page = etree.HTML(response)
        if page is None:
            continue
        for href in page.xpath('//td[@align="left"]/a[@target="_blank"]/@href'):
            detail_parse('http://www.hnggzy.com' + href)
def list_parse():
    """Drain queued zbtb.com.cn list pages, tagging each link with its area.

    Packs each record as "url**area_name" for detail_parse.
    BUG fix: a failed fetch previously returned, dropping the rest of the
    queue; now only the failed page is skipped.
    """
    while True:
        list_url = save.pop(name=dbname)
        if list_url is None:
            break
        response = request.get(list_url)
        if response is None:
            continue
        page = etree.HTML(response)
        for url in page.xpath('//td[@class="td_left"]/a/@href'):
            area_names = page.xpath(
                '//a[@href="{}"]/../preceding-sibling::td//text()'.format(url))
            area_name = area_names[0] if len(area_names) != 0 else '全国'
            detail_parse(url + '**' + area_name)
def list_parse():
    """Drain queued list pages, packing each link as "url**date".

    BUG fixes: a failed fetch previously returned and dropped the queued
    remainder; and a row with no date cell produced date=None, making the
    '**' concatenation raise TypeError — an empty string is used instead.
    """
    while True:
        list_url = save.pop(name=dbname)
        if list_url is None:
            break
        response = request.get(list_url)
        if response is None:
            continue
        page = etree.HTML(response)
        for url in page.xpath('//td[@align="left"]/a/@href'):
            dates = page.xpath(
                '//a[@href="{}"]/../../td[last()]/text()'.format(url))
            date = dates[0] if len(dates) != 0 else ''
            detail_parse(url + '**' + date)
def list_parse():
    """Drain queued Jilin list pages, resolving relative detail links."""
    while True:
        list_url = save.pop(name=dbname)
        if list_url is None:
            break
        response = request.get(list_url)
        if response is None:
            continue
        page = etree.HTML(response)
        for href in page.xpath('//li//a/@href'):
            detail_parse('http://www.jlszfcg.gov.cn'
                         + str(href).replace('../', '/'))
def list_parse():
    """Drain queued Guangzhou list pages; pack "url***date***title" records.

    BUG fix: the original conditional expression bound to the *entire*
    concatenation, so a row without a date cell replaced the whole packed
    record with just '0000-00-00' (losing url and title); the placeholder
    now substitutes for the date field only.
    """
    while True:
        list_url = save.pop(name=dbname)
        if list_url is None:
            break
        response = request.get(list_url)
        if response is None:
            continue
        page = etree.HTML(response)
        for url in page.xpath('//td/a/@href'):
            times = page.xpath(
                '//a[@href="{}"]/../following-sibling::td/text()'.format(url))
            titles = page.xpath('//a[@href="{}"]/text()'.format(url))
            if len(titles) == 0:
                continue
            date = times[0] if len(times) != 0 else '0000-00-00'
            detail_parse('http://www.gzggzy.cn' + url + '***' + date
                         + '***' + titles[0])
def list_parse():
    """Drain queued Shenzhen list pages, resolving relative detail links.

    The base path depends on whether the queue URL is for award notices
    (zbcjgg_zf) or tender notices (zbgg_zf).
    """
    while True:
        list_url = save.pop(name=dbname)
        if list_url is None:
            break
        response = request.get(list_url)
        if response is None:
            continue
        page = etree.HTML(response)
        for href in page.xpath('//ul[@class="list"]/li/a/@href'):
            if 'zbcjgg_zf' in list_url:
                base = 'http://ggzy.sz.gov.cn/jyxx/zfcgxm/zbcjgg_zf'
            else:
                base = 'http://ggzy.sz.gov.cn/jyxx/zfcgxm/zbgg_zf'
            detail_parse(base + str(href).replace('./', '/'))
def list_parse():
    """Drain queued Taiyuan list pages; extract ids from onclick handlers.

    Each row's @onclick looks like fDetail(<id>) — TODO confirm exact form.
    BUG fix: str.strip('fDetail(') strips a character *set* from both ends
    (any of f/D/e/t/a/i/l/'(') rather than the literal prefix, which would
    corrupt ids containing those edge characters; a regex now extracts the
    parenthesised id, with the old strip chain kept as a fallback.
    """
    while True:
        list_url = save.pop(name=dbname)
        if list_url is None:
            break
        response = request.get(list_url)
        if response is None:
            continue
        page = etree.HTML(response)
        for onclick in page.xpath('//tr[contains(@class,"pt2")]/@onclick'):
            titles = page.xpath('//tr[@onclick="{}"]//a/text()'.format(onclick))
            if len(titles) == 0:
                continue
            match = re.search(r'\((.*?)\)', str(onclick))
            bid_id = match.group(1) if match else str(onclick).strip(
                'fDetail(').strip(')')
            detail_parse('http://www.tyzfcg.gov.cn/view.php?app=bidDetail&id='
                         + bid_id + '**' + titles[0])