def detail_parse(url):
    """Parse a Chongqing bidding detail page and persist the record.

    url: absolute detail-page URL; None is ignored.
    Side effects: HTTP GET via request.get, write via Save.push.
    """
    if url is None:
        return
    response = request.get(url)
    if response is None:
        return
    # Raw string avoids invalid-escape warnings for \$, \(, \. in the pattern.
    # The title is injected client-side via $("#tith1").html("...").
    titles = re.findall(r'\$\("#tith1"\)\.html\("(.*?)"\);', response)
    title = None
    if len(titles) == 2:
        title = str(titles[1]).replace(' ', '').replace('\r', '').replace('\n', '')
    if title is None:
        return
    item = {}
    item['status'] = '招标公告' if '招标' in title else '中标公告'
    item['title'] = title
    item['area_name'] = '重庆'
    item['source'] = 'http://www.cqzb.gov.cn'
    date = re.findall(r'发布时间: (\d\d\d\d-\d\d-\d\d)', response)
    item['publish_date'] = date[0] if len(date) > 0 else None
    item['detail_url'] = url
    item['create_time'] = datetime.now()
    item['zh_name'] = '重庆市招标投标综合网'
    item['en_name'] = 'Chongqing Bidding Comprehensive Network'
    selector = etree.HTML(response)
    div_html = selector.xpath('//div[@class="ztb_con_exp"]')
    item['content_html'] = html.tostring(
        div_html[0], encoding='utf-8', pretty_print=True,
        method="html").decode('utf-8')
    Save.push(item=item, key=url)
def detail_parse(id):
    """Fetch one Guizhou tender record from the JSON API and persist it.

    id: record id (numeric string from the list API); None is ignored.
    Side effects: HTTP GET via request.get, write via Save.push.
    """
    if id is None:
        return
    url = 'http://www.gzzbw.cn/api/trade/{}'.format(id)
    response = request.get(url, header={
        'Accept': '*/*',
    })
    if response is None:
        return
    res = json.loads(response)
    # BUG fix: the membership test was done on the raw JSON *string*
    # ('Title' in response) instead of the parsed dict.
    title = res['Title'] if 'Title' in res else None
    if title is None:
        return
    item = {}
    item['title'] = str(title).replace('\r', '').replace('\n', '')
    item['area_name'] = '贵州'
    item['source'] = 'http://www.gzzbw.cn'
    item['publish_date'] = res['PublishDate']
    item['detail_url'] = url
    item['create_time'] = datetime.now()
    item['content_html'] = res['Content']
    item['zh_name'] = '贵州省招标投标公共服务平台'
    item['en_name'] = 'Guizhou Tender Public Service Platform'
    status = ''
    for timezone in res['Timezones']:
        # Compare the record id directly instead of substring-matching
        # the dict repr ("'Id': {}" in str(timezone)), which was fragile.
        if str(timezone.get('Id')) == str(id):
            status = timezone['BTypeName']
    item['status'] = status
    Save.push(item=item, key=id)
def detail_parse(url):
    """Parse a Guangxi procurement detail page (gb2312) and persist it.

    url: absolute detail-page URL; None is ignored.
    """
    if url is None:
        return
    response = request.get(url, type='gb2312')
    if response is None:
        return
    Html = etree.HTML(response)
    titles = Html.xpath('//div[@class="ctitle ctitle1"]/text()')
    item = {}
    item['title'] = titles[0] if len(titles) != 0 else None
    if item['title'] is None:
        return
    item['area_name'] = '广西壮族自治区'
    item['source'] = 'http://www.gxgp.gov.cn'
    # Raw string avoids invalid-escape warnings for \d.
    date = re.findall(r'发布日期:(\d\d\d\d-\d\d-\d\d)', response)
    item['publish_date'] = date[0] if len(date) != 0 else '未公布'
    item['detail_url'] = url
    item['create_time'] = datetime.now()
    item['zh_name'] = '广西壮族自治区政府采购中心'
    item['en_name'] = 'Guangxi Government Procurement Center'
    div_html = Html.xpath('//div[@class="page_row"]')
    item['content_html'] = html.tostring(
        div_html[0], encoding='utf-8', pretty_print=True,
        method="html").decode('utf-8')
    # Status is keyed off the URL path segment. (Removed an unused
    # second etree.HTML parse of content_html.)
    if 'ygswz' in url:
        item['status'] = '预公告'
    elif ('cgxjcg' in url or 'cggkzb' in url or 'cgjz' in url
          or 'cgjzxcs' in url or 'cgdyly' in url):
        item['status'] = '采购公告'
    else:
        item['status'] = '中标公告'
    Save.push(item=item, key=url)
def detail_parse(url):
    """Parse a Hebei public-resource detail page and persist it.

    url: absolute detail-page URL; None is ignored.
    """
    if url is None:
        return
    response = request.get(url)
    if response is None:
        return
    Html = etree.HTML(response)
    titles = Html.xpath('//h1[@class="show-title"]/text()')
    if len(titles) == 0:
        return
    item = {}
    item['title'] = str(titles[0]).strip()
    item['area_name'] = '河北'
    item['source'] = 'http://www.hebpr.cn'
    # The page exposes no reliable publish date; stored empty by design
    # (the old date regex was already commented out).
    item['publish_date'] = ''
    item['detail_url'] = url
    item['create_time'] = datetime.now()
    item['zh_name'] = '河北省政务服务管理办公室'
    item['en_name'] = 'Hebei Province Public resource'
    div_html = Html.xpath('//div[@class="show-con infoContent"]')
    item['content_html'] = html.tostring(
        div_html[0], encoding='utf-8', pretty_print=True,
        method="html").decode('utf-8')
    # Status is the last breadcrumb link. (Removed unused html1 re-parse.)
    states = Html.xpath('//div[@class="bread-route"]/a[last()]/text()')
    item['status'] = states[0] if len(states) != 0 else None
    Save.push(item=item, key=url)
def list_parse():
    """Drain queued ccgp list pages, packing each link as url**title**date."""
    while True:
        list_url = save.pop(name=dbname)
        if list_url is None:
            break
        response = request.get(list_url)
        if response is None:
            continue
        page = etree.HTML(response)
        for url in page.xpath('//tr[@height="22"]/td/a[@target="_blank"]/@href'):
            title = ''.join(page.xpath('//a[@href="{}"]/text()'.format(url)))
            date = ''.join(page.xpath(
                '//a[@href="{}"]/../following-sibling::td[last()]/text()'
                .format(url)))
            if title != '' and date != '':
                detail_parse('http://www.ccgp.com.cn/ccgp/list'
                             + str(url).replace('./', '/')
                             + '**' + title + '**' + date)
def detail_parse(url):
    """Parse a Shenzhen public-resource detail page and persist it.

    url: absolute detail-page URL; None is ignored.
    """
    if url is None:
        return
    response = request.get(url, type='utf-8')
    if response is None:
        return
    Html = etree.HTML(response)
    titles = Html.xpath('//div[@class="sub-xx-tit"]/text()')
    if len(titles) == 0:
        return
    item = {}
    item['title'] = str(titles[0]).strip()
    item['area_name'] = '深圳'
    item['source'] = 'http://ggzy.sz.gov.cn'
    # NOTE: str.strip removes a character *set*; works here because the
    # date text contains none of the prefix characters.
    item['publish_date'] = (''.join(
        Html.xpath('//div[@class="sub-xx-time"]/text()'))).strip('发表时间:')
    item['detail_url'] = url
    item['create_time'] = datetime.now()
    item['zh_name'] = '深圳市公共资源交易平台'
    item['en_name'] = 'Shenzhen Public resource'
    div_html = Html.xpath('//div[@class="sub-warp"]')
    item['content_html'] = html.tostring(
        div_html[0], encoding='utf-8', pretty_print=True,
        method="html").decode('utf-8')
    # Status is the last breadcrumb entry. (Removed unused html1 re-parse.)
    states = Html.xpath('//div[@class="weizhi"]/a/text()')
    item['status'] = states[-1] if len(states) != 0 else None
    Save.push(item=item, key=url)
def detail_parse(url):
    """Parse a Jilin government-procurement detail page and persist it.

    url: absolute detail-page URL; None is ignored.
    """
    if url is None:
        return
    response = request.get(url, type='utf-8')
    if response is None:
        return
    Html = etree.HTML(response)
    titles = Html.xpath('//div[@class="bx_title"]/text()')
    if len(titles) == 0 or str(titles[0]).strip() == '':
        return
    item = {}
    item['title'] = str(titles[0]).strip()
    item['area_name'] = '吉林'
    item['source'] = 'http://www.jlszfcg.gov.cn'
    # Raw strings avoid invalid-escape warnings for \d and \(.
    date = re.findall(r'发布日期:(\d\d\d\d-\d\d-\d\d)', response)
    item['publish_date'] = str(date[0]).strip() if len(date) > 0 else None
    item['detail_url'] = url
    item['create_time'] = datetime.now()
    div_html = Html.xpath('//div[@class="con08_b"]')
    item['content_html'] = html.tostring(
        div_html[0], encoding='utf-8', pretty_print=True,
        method="html").decode('utf-8')
    # Status is the last parenthesised tag in the title, e.g. "...(中标公告)".
    # (Removed unused html1 re-parse.)
    states = re.findall(r'\(.*?\)', item['title'])
    item['status'] = str(states[-1]).strip('(').strip(')') if len(states) != 0 else None
    item['zh_name'] = '吉林省政府采购中心'
    item['en_name'] = 'Jilin Government procurement center'
    Save.push(item=item, key=url)
def detail_parse(url):
    """Parse a Hubei public-resource (hbggzy) detail page and persist it.

    url: absolute detail-page URL; None is ignored.
    """
    if url is None:
        return
    response = request.get(url)
    if response is None:
        return
    Html = etree.HTML(response)
    titles = Html.xpath('//h6[@class="news-article-tt"]/text()')
    if len(titles) == 0:
        return
    item = {}
    item['title'] = str(titles[0]).strip()
    item['area_name'] = '湖北'
    item['source'] = 'http://www.hbggzy.cn'
    # Raw string avoids invalid-escape warnings for \d.
    date = re.findall(r'发稿时间 :(\d\d\d\d-\d\d-\d\d)', response)
    item['publish_date'] = str(date[0]).strip() if len(date) > 0 else None
    item['detail_url'] = url
    item['create_time'] = datetime.now()
    item['zh_name'] = '湖北省公共资源交易网'
    item['en_name'] = 'Hubei Province Government Procurement'
    div_html = Html.xpath('//div[@class="ewb-row"]')
    item['content_html'] = html.tostring(
        div_html[0], encoding='utf-8', pretty_print=True,
        method="html").decode('utf-8')
    # Status is the last breadcrumb span.
    states = Html.xpath('//div[@class="ewb-route"]/span[last()]/text()')
    item['status'] = states[-1] if len(states) != 0 else None
    Save.push(item=item, key=url)
def detail_parse(url):
    """Parse a Guangzhou detail record packed as "url***date***title".

    url: packed string produced by the matching list_parse; None is ignored.
    Raises IndexError if fewer than three '***'-separated parts are given
    (same as the original triple-split).
    """
    if url is None:
        return
    # Split once instead of three times.
    parts = url.split('***')
    url, publish, title = parts[0], parts[1], parts[2]
    response = request.get(url)
    if response is None:
        return
    Html = etree.HTML(response)
    item = {}
    item['title'] = title
    item['area_name'] = '广州'
    item['source'] = 'http://www.gzggzy.cn'
    item['publish_date'] = publish
    item['detail_url'] = url
    item['create_time'] = datetime.now()
    item['zh_name'] = '广州公共资源交易中心'
    item['en_name'] = 'Guangzhou City Government Procurement'
    div_html = Html.xpath('//div[@class="xx-text"]')
    item['content_html'] = html.tostring(
        div_html[0], encoding='utf-8', pretty_print=True,
        method="html").decode('utf-8')
    # Status is keyed off the channelId query parameter.
    # (Removed unused html1 re-parse.)
    if 'channelId=456' in url:
        item['status'] = '采购公告'
    elif 'channelId=448' in url:
        item['status'] = '预公告'
    elif 'channelId=457' in url:
        item['status'] = '更正公告'
    else:
        item['status'] = '结果公告'
    Save.push(item=item, key=url)
def detail_parse(url):
    """Parse a Hubei government-procurement detail page and persist it.

    url: absolute detail-page URL; None is ignored.
    """
    item = {}
    if url is None:
        return
    response = request.get(url, type='utf-8')
    if response is None:
        return
    Html = etree.HTML(response)
    titles = Html.xpath('//div[@class="title2"]/text()')
    item['title'] = str(titles[0]).strip() if len(titles) != 0 else None
    if item['title'] is None:
        return
    item['area_name'] = '湖北'
    item['source'] = 'http://www.hubeigp.gov.cn'
    date = Html.xpath('//td[contains(text(),"发布时间")]//text()')
    # NOTE: str.strip removes a character *set*, not the literal prefix;
    # works here because the date digits/dashes are not in that set.
    item['publish_date'] = str(
        date[0]).strip('发布时间:') if len(date) != 0 else '未公布'
    item['detail_url'] = url
    item['create_time'] = datetime.now()
    item['zh_name'] = '湖北省政府采购中心'
    item['en_name'] = 'Hubei Government Procurement Center'
    # Detail body. (Removed unused html1 re-parse of content_html.)
    div_html = Html.xpath('//td[@class="h2"]')
    item['content_html'] = html.tostring(
        div_html[0], encoding='utf-8', pretty_print=True,
        method="html").decode('utf-8')
    # Status: last breadcrumb link.
    states = Html.xpath('//td[contains(text(),"当前位置")]/span/a[last()]/text()')
    item['status'] = str(states[0]).strip() if len(states) != 0 else None
    Save.push(item=item, key=url)
def detail_parse(url):
    """Parse a Changsha public-resource detail page and persist it.

    The site uses several different title markups, so a list of fallback
    XPaths is tried in order (same order as the original cascade).
    """
    if url is None:
        return
    response = request.get(url)
    if response is None:
        return
    Html = etree.HTML(response)
    title_xpaths = (
        '//div[@class="detail_t"]/h1/text()',
        '//span[@id="title"]/text()',
        '//p[@style="TEXT-ALIGN: center"]/text()',
        '//p[@style="TEXT-ALIGN: right"]/text()',
        '//span[@style="FONT-SIZE: 20px"]/strong/text()',
    )
    titles = []
    for xpath in title_xpaths:
        titles = Html.xpath(xpath)
        if len(titles) != 0:
            break
    if len(titles) == 0 or str(titles[0]).strip() == '':
        return
    item = {}
    item['title'] = str(titles[0]).strip()
    item['area_name'] = '长沙'
    item['source'] = 'http://csggzy.changsha.gov.cn/'
    # Raw string avoids invalid-escape warnings for \d.
    date = re.findall(r'发布日期:(\d\d\d\d-\d\d-\d\d)', response)
    item['publish_date'] = str(date[0]).strip() if len(date) > 0 else None
    item['detail_url'] = url
    item['create_time'] = datetime.now()
    div_html = Html.xpath('//div[@class="detail"]')
    item['content_html'] = html.tostring(
        div_html[0], encoding='utf-8', pretty_print=True,
        method="html").decode('utf-8')
    # Status is the last breadcrumb link.
    states = Html.xpath('//div[@id="dh"]/a/text()')
    item['status'] = str(states[-1]).strip() if len(states) != 0 else None
    item['zh_name'] = '长沙公共资源交易中心'
    item['en_name'] = 'Changsha Public resource'
    Save.push(item=item, key=url)
def detail_parse(url):
    """Parse a Shijiazhuang public-resource detail page and persist it.

    url: absolute detail-page URL; None is ignored.
    """
    if url is None:
        return
    response = request.get(url)
    if response is None:
        return
    Html = etree.HTML(response)
    titles = Html.xpath('//div[@class="mt20 tc f24 fb lp1"]/text()')
    if len(titles) == 0:
        return
    item = {}
    item['title'] = str(titles[0]).strip()
    item['area_name'] = '石家庄'
    item['source'] = 'http://www.sjzsxzspj.gov.cn'
    # Raw string avoids invalid-escape warnings for \d.
    date = re.findall(r'(\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d)', response)
    item['publish_date'] = str(date[0]) if len(date) > 0 else None
    item['detail_url'] = url
    item['create_time'] = datetime.now()
    item['zh_name'] = '石家庄公共资源交易网'
    item['en_name'] = 'Shijiazhuang City Public resource'
    div_html = Html.xpath('//div[@class="neiyeibox fr"]')
    item['content_html'] = html.tostring(
        div_html[0], encoding='utf-8', pretty_print=True,
        method="html").decode('utf-8')
    # BUG fix: 'zfzbgga' also contains 'zfzbgg', so the original
    # elif 'zfzbgga' branch was unreachable; test the more specific
    # token first. (Removed unused html1 re-parse.)
    if 'zfzbgga' in url:
        item['status'] = '中标公告'
    elif 'zfzbgg' in url:
        item['status'] = '招标公告'
    elif 'gzgg' in url:
        item['status'] = '更正公告'
    else:
        item['status'] = '单一来源'
    Save.push(item=item, key=url)
def detail_parse(url):
    """Parse a Henan public-resource detail page and persist it.

    url: absolute detail-page URL; None is ignored.
    """
    if url is None:
        return
    response = request.get(url)
    if response is None:
        return
    Html = etree.HTML(response)
    titles = Html.xpath('//tr/td[@align="center"]/font/text()')
    if len(titles) == 0:
        return
    item = {}
    item['title'] = str(titles[0]).strip()
    item['area_name'] = '河南'
    item['source'] = 'http://www.hnggzy.com'
    # Site prints slash dates, e.g. 2019/1/2; the 4-5 digit year group
    # mirrors the original pattern. Raw string avoids escape warnings.
    date = re.findall(r'信息时间:(\d{4,5}/\d{1,2}/\d{1,2})', response)
    item['publish_date'] = str(date[0]).strip() if len(date) > 0 else None
    item['detail_url'] = url
    item['create_time'] = datetime.now()
    item['zh_name'] = '河南省公共资源交易中心'
    item['en_name'] = 'Henan Province Public resource'
    div_html = Html.xpath('//table[@align="center"]')
    item['content_html'] = html.tostring(
        div_html[0], encoding='utf-8', pretty_print=True,
        method="html").decode('utf-8')
    # Status is the last breadcrumb link. (Removed unused html1 re-parse.)
    states = Html.xpath('//font[contains(text(),"您现在的位置")]/..//a//text()')
    item['status'] = states[-1] if len(states) != 0 else None
    Save.push(item=item, key=url)
def detail_parse(url):
    """Parse a Taiyuan detail record packed as "url**title".

    url: packed string produced by the matching list_parse; None is ignored.
    """
    if url is None:
        return
    # Split once instead of twice.
    parts = url.split('**')
    title = parts[-1]
    url = parts[0]
    response = request.get(url)
    if response is None:
        return
    Html = etree.HTML(response)
    item = {}
    item['title'] = title
    item['area_name'] = '太原'
    item['source'] = 'http://www.tyzfcg.gov.cn/'
    # Raw string avoids invalid-escape warnings for \d.
    date = re.findall(r'发布时间:(\d\d\d\d-\d\d-\d\d)', response)
    item['publish_date'] = date[0] if len(date) != 0 else '未公布'
    item['detail_url'] = url
    item['create_time'] = datetime.now()
    item['zh_name'] = '太原市政府采购中心'
    item['en_name'] = 'Taiyuan Government Procurement Center'
    div_html = Html.xpath('//tr[@class="bk5"]')
    item['content_html'] = html.tostring(
        div_html[0], encoding='utf-8', pretty_print=True,
        method="html").decode('utf-8')
    # Status appears in breadcrumb text like "--中标公告--正文".
    # (Removed unused html1 re-parse.)
    states = re.findall(r'--(.*?)--正文', response)
    item['status'] = str(states[0]).strip() if len(states) != 0 else None
    Save.push(item=item, key=url)
def list_parse():
    """Drain queued Inner Mongolia list pages and hand each detail link off."""
    while True:
        list_url = save.pop(name=dbname)
        if list_url is None:
            break
        response = request.get(list_url)
        if response is None:
            continue
        page = etree.HTML(response)
        for href in page.xpath('//tr/td/li/a/@href'):
            detail_parse('http://www.nmgzfcg.gov.cn' + href)
def list_parse():
    """Drain queued Dalian trade list pages and hand each detail link off."""
    while True:
        list_url = save.pop(name=dbname)
        if list_url is None:
            break
        response = request.get(list_url)
        if response is None:
            continue
        page = etree.HTML(response)
        for href in page.xpath('//tr[@class="ewb-trade-tr"]/td/a/@href'):
            detail_parse('http://ggzyjy.dl.gov.cn' + href)
def list_parse():
    """Drain queued Hubei list pages and hand each detail link off.

    BUG fix: a failed fetch previously returned, silently dropping every
    URL still queued; now only the failed page is skipped (matching the
    other list_parse functions in this module).
    """
    while True:
        list_url = save.pop(name=dbname)
        if list_url is None:
            break
        response = request.get(list_url)
        if response is None:
            continue
        page = etree.HTML(response)
        for href in page.xpath('//td[@class="linebg"]/a/@href'):
            detail_parse('http://www.hubeigp.gov.cn' + href)
def list_parse():
    """Drain queued list pages and hand each (already absolute) link off."""
    while True:
        list_url = save.pop(name=dbname)
        if list_url is None:
            break
        response = request.get(list_url)
        if response is None:
            continue
        page = etree.HTML(response)
        for href in page.xpath('//div[@class="List2 Top8"]//li/a/@href'):
            detail_parse(href)
def list_parse():
    """Drain queued hbggzy list pages and hand each detail link off."""
    while True:
        list_url = save.pop(name=dbname)
        if list_url is None:
            break
        response = request.get(list_url)
        if response is None:
            continue
        page = etree.HTML(response)
        for href in page.xpath('//ul[@class="ewb-news-items"]/li/a/@href'):
            detail_parse('http://www.hbggzy.cn' + href)
def list_parse():
    """Drain queued Hebei list pages and hand each detail link off."""
    while True:
        list_url = save.pop(name=dbname)
        if list_url is None:
            break
        response = request.get(list_url)
        if response is None:
            continue
        page = etree.HTML(response)
        for href in page.xpath('//a[@class="frame-con-link clearfix"]/@href'):
            detail_parse('http://www.hebpr.cn' + href)
def list_parse():
    """Drain queued chinatax list pages and hand each detail link off.

    BUG fix: a failed fetch previously returned, silently dropping every
    URL still queued; now only the failed page is skipped (matching the
    other list_parse functions in this module).
    """
    while True:
        list_url = save.pop(name=dbname)
        if list_url is None:
            break
        response = request.get(list_url)
        if response is None:
            continue
        page = etree.HTML(response)
        for href in page.xpath('//dd/a[contains(@href,"content.html")]/@href'):
            detail_parse('http://www.chinatax.gov.cn/'
                         + str(href).replace('../', ''))
def list_parse():
    """Drain queued Chongqing list pages and hand each detail link off."""
    while True:
        list_url = save.pop(name=dbname)
        if list_url is None:
            break
        response = request.get(list_url)
        if response is None:
            continue
        page = etree.HTML(response)
        for href in page.xpath('//div[@class="ztb_list_right"]/ul/li/a/@href'):
            detail_parse('http://www.cqzb.gov.cn/' + href)
def list_parse():
    """Drain queued Guizhou API list pages; extract record ids and hand off.

    Fixes: raw string for the regex (avoids an invalid-escape warning for
    \\d) and a loop variable that no longer shadows the id() builtin.
    """
    while True:
        list_url = save.pop(name=dbname)
        if list_url is None:
            break
        response = request.get(list_url, header={
            'Accept': '*/*',
        })
        if response is None:
            continue
        for record_id in re.findall(r'"Id":(\d{1,10}),', response):
            detail_parse(record_id)
def list_parse():
    """Drain queued Henan list pages and hand each detail link off."""
    while True:
        list_url = save.pop(name=dbname)
        if list_url is None:
            break
        response = request.get(list_url)
        if response is None:
            continue
        page = etree.HTML(response)
        if page is None:
            continue
        for href in page.xpath('//td[@align="left"]/a[@target="_blank"]/@href'):
            detail_parse('http://www.hnggzy.com' + href)
def list_parse():
    """Drain queued zbtb.com.cn list pages, tagging each link with its area.

    Packs each record as "url**area_name" for detail_parse.
    BUG fix: a failed fetch previously returned, dropping the rest of the
    queue; now only the failed page is skipped.
    """
    while True:
        list_url = save.pop(name=dbname)
        if list_url is None:
            break
        response = request.get(list_url)
        if response is None:
            continue
        page = etree.HTML(response)
        for url in page.xpath('//td[@class="td_left"]/a/@href'):
            area_names = page.xpath(
                '//a[@href="{}"]/../preceding-sibling::td//text()'.format(url))
            area_name = area_names[0] if len(area_names) != 0 else '全国'
            detail_parse(url + '**' + area_name)
def list_parse():
    """Drain queued list pages, packing each link as "url**date".

    BUG fixes: a failed fetch previously returned and dropped the queued
    remainder; and a row with no date cell produced date=None, making the
    '**' concatenation raise TypeError — an empty string is used instead.
    """
    while True:
        list_url = save.pop(name=dbname)
        if list_url is None:
            break
        response = request.get(list_url)
        if response is None:
            continue
        page = etree.HTML(response)
        for url in page.xpath('//td[@align="left"]/a/@href'):
            dates = page.xpath(
                '//a[@href="{}"]/../../td[last()]/text()'.format(url))
            date = dates[0] if len(dates) != 0 else ''
            detail_parse(url + '**' + date)
def list_parse():
    """Drain queued Jilin list pages, resolving relative detail links."""
    while True:
        list_url = save.pop(name=dbname)
        if list_url is None:
            break
        response = request.get(list_url)
        if response is None:
            continue
        page = etree.HTML(response)
        for href in page.xpath('//li//a/@href'):
            detail_parse('http://www.jlszfcg.gov.cn'
                         + str(href).replace('../', '/'))
def list_parse():
    """Drain queued Guangzhou list pages; pack "url***date***title" records.

    BUG fix: the original conditional expression bound to the *entire*
    concatenation, so a row without a date cell replaced the whole packed
    record with just '0000-00-00' (losing url and title); the placeholder
    now substitutes for the date field only.
    """
    while True:
        list_url = save.pop(name=dbname)
        if list_url is None:
            break
        response = request.get(list_url)
        if response is None:
            continue
        page = etree.HTML(response)
        for url in page.xpath('//td/a/@href'):
            times = page.xpath(
                '//a[@href="{}"]/../following-sibling::td/text()'.format(url))
            titles = page.xpath('//a[@href="{}"]/text()'.format(url))
            if len(titles) == 0:
                continue
            date = times[0] if len(times) != 0 else '0000-00-00'
            detail_parse('http://www.gzggzy.cn' + url + '***' + date
                         + '***' + titles[0])
def list_parse():
    """Drain queued Shenzhen list pages, resolving relative detail links.

    The base path depends on whether the queue URL is for award notices
    (zbcjgg_zf) or tender notices (zbgg_zf).
    """
    while True:
        list_url = save.pop(name=dbname)
        if list_url is None:
            break
        response = request.get(list_url)
        if response is None:
            continue
        page = etree.HTML(response)
        for href in page.xpath('//ul[@class="list"]/li/a/@href'):
            if 'zbcjgg_zf' in list_url:
                base = 'http://ggzy.sz.gov.cn/jyxx/zfcgxm/zbcjgg_zf'
            else:
                base = 'http://ggzy.sz.gov.cn/jyxx/zfcgxm/zbgg_zf'
            detail_parse(base + str(href).replace('./', '/'))
def list_parse():
    """Drain queued Taiyuan list pages; extract ids from onclick handlers.

    Each row's @onclick looks like fDetail(<id>) — TODO confirm exact form.
    BUG fix: str.strip('fDetail(') strips a character *set* from both ends
    (any of f/D/e/t/a/i/l/'(') rather than the literal prefix, which would
    corrupt ids containing those edge characters; a regex now extracts the
    parenthesised id, with the old strip chain kept as a fallback.
    """
    while True:
        list_url = save.pop(name=dbname)
        if list_url is None:
            break
        response = request.get(list_url)
        if response is None:
            continue
        page = etree.HTML(response)
        for onclick in page.xpath('//tr[contains(@class,"pt2")]/@onclick'):
            titles = page.xpath('//tr[@onclick="{}"]//a/text()'.format(onclick))
            if len(titles) == 0:
                continue
            match = re.search(r'\((.*?)\)', str(onclick))
            bid_id = match.group(1) if match else str(onclick).strip(
                'fDetail(').strip(')')
            detail_parse('http://www.tyzfcg.gov.cn/view.php?app=bidDetail&id='
                         + bid_id + '**' + titles[0])