Example #1
def curl(mod,act,param):
	http = HttpWrap()
	http.set_header('Content-Type','application/json')
	body = json.dumps(param)
	url  = "http://192.168.10.126:6000/%s/%s" % (mod,act)
	res = http.request(url,'POST',body)
	res = json.loads(http.read(res))
	#print(json.dumps(res,indent=1))
	pprint(res)
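A minimal call sketch; the module name, action, and payload below are hypothetical and only illustrate the expected argument shapes for the internal service at 192.168.10.126:6000:
# hypothetical invocation: POSTs the JSON payload to http://192.168.10.126:6000/proinfo/search
curl('proinfo', 'search', {'page': 1, 'pageSize': 20, 'keyw': '机械'})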
Example #2
def format_html(url,proxyinfo=""):
	"""内容提取
	"""
	result = {}
	data = {}
	boss_info=[]
	keys_list = []
	value_list = []
	
	#pdb.set_trace()
	http = HttpWrap()
	if proxyinfo:
		http.set_proxy({'http':proxyinfo})
	res = http.request(url)
	if res.code !=200:
		return False
	html = http.read(res)
	try:
		context = etree.HTML(html) 
	except:
		return False
	nodes = context.xpath('//div[@id="jibenxinxi"]//tr')
	result['corp_seq_id']=url.split('=')[1]
	data={}
	keys_list=[]
	value_list=[]
	for node in nodes:
		item=node.getchildren()
		if not(item) or len(item)%2>0:
			continue
		for n in item:
			if n.tag=='th':
				keys_list.append(n.text)
			elif n.tag=='td':
				value_list.append(n.text)
	
	data = dict(map(lambda x,y:[x,y], keys_list,value_list))
	for k,v in data.items():
		if not k :
			continue
		if k in title_base and v:
			result[title_base[k]] = v.strip()
		#else:
		#	print( k,v)
	result['gov_url']=url
	#shareholder info
	info = context.xpath('//div[@id="invDiv"]//tr')
	try:
		for node in info:
			boss_info.append(node.getchildren()[1].text.strip())
	except Exception as e:
		traceback.print_exc()
	if boss_info:
		result['shareholders']=json.dumps(boss_info)		

	return result	
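A minimal usage sketch, assuming a hypothetical detail-page URL whose query string carries the corp_seq_id, plus an optional ip:port proxy (both values below are placeholders):
record = format_html('http://example.gov.cn/corpDetail.jsp?corp_seq_id=12345', proxyinfo='10.0.0.1:8080')
if record:
	print(record['gov_url'], record.get('shareholders'))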
Example #3
def format_html(url):
	"""内容提取
	"""
	#pdb.set_trace()	
	result = {}
	data = {}
	boss_info=[]
	keys_list = []
	value_list = []
	
	http = HttpWrap()
	res = http.request(url)
	if res.code !=200:
		return False
	html = http.read(res)
	try:
		context = etree.HTML(html) 
	except:
		return False
	nodes = context.xpath('//table[@class="result-table"][1]//tr')
	for node in nodes:
		item=node.getchildren()
		if not(item) or len(item)%2>0:
			continue
		for i in range(len(item)):
			txt = item[i].text
			if not txt:
				txt=""
			if (i+1)%2==0:
				value_list.append(txt.strip())
			else:
				keys_list.append(txt.strip())
												
	data = dict(map(lambda x,y:[x,y], keys_list,value_list))
	for k,v in data.items():
		if not k :
			continue
		if k in title_base and v:
			result[title_base[k]] = v.strip()
		else:
			print("K:",k,"V:",v)
	result['gov_url']=url
	#shareholder info
	info = context.xpath('//table[@id="touziren"]//tr')
	try:
		if len(info) >2:
			info = info[2:]
		for node in info:
			boss_info.append([node.getchildren()[0].text.strip(),node.getchildren()[1].text.strip()])
	except Exception as e:
		traceback.print_exc()
	if boss_info:
		result['shareholders']=json.dumps(boss_info)

	return result	
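The dict(map(lambda x,y:[x,y], ...)) idiom pairs the two lists element-wise; a small self-check showing it is equivalent to dict(zip(...)):
keys = ['a', 'b']
values = [1, 2]
assert dict(map(lambda x, y: [x, y], keys, values)) == dict(zip(keys, values)) == {'a': 1, 'b': 2}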
Example #4
def get_proxy():
	"""提取代理ip
	"""
	http = HttpWrap()
	proxyip=[]
	url = "http://192.168.10.126:1982/cmd/get_proxyip"
	res = http.request(url)
	if res.code==200:
		try:
			proxyip = json.loads(res.read().decode())		
		except:
			pass
	return proxyip
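A sketch of how the returned list might feed the proxy-aware collectors above, assuming each entry is an ip:port string as expected by set_proxy elsewhere in these examples:
proxies = get_proxy()
for p in proxies:
	# hand each candidate proxy to a proxy-aware function, e.g. format_html(url, proxyinfo=p) or get_info(corp, proxyinfo=p)
	print('candidate proxy:', p)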
Example #5
def get_stat_data(name,info):
	"""通过配置文件,获取统计数据
	"""
	#url提交模式
	http = HttpWrap()
	http.set_header('Content-type','application/json')
	url = "http://192.168.10.126:1985/api/set"
	
	for i in range(0,len(info['history_from'])):
		itm = info['history_from'][i]
		source = itm['source'].split('.')
		if source[1] == 'sphinx':
			host_info = get_host_by_data(itm['source'])
			if not host_info:
				return [-1,"key error: %s not in sysconfig." % itm['source']]
			
			sp = sphinx(host_info['host'],host_info['port'])
			expression = itm['expression']
			expression['index'] = source[2]
			total_found = 0
			while True:
				if total_found >0:
					if expression['pageSize'] * expression['page'] >=total_found:
						break
					expression['page'] +=1
					
				sp.initQuery(itm['expression'])
				rs = sp.RunQueries()
				if rs and rs[0]['status']==0:
					total_found = rs[0]['total_found']
					_items = {}
					for row in rs[0]['matches']:
						_items["%s%s" % (itm['key_prefix'],row['attrs'][itm['key']])]=[row['attrs'][itm['value']],utils.timestamp(0,'d')]
					if _items:
						data = json.dumps({'gkey':name,'data':_items})
						_rs = http.request(url,"POST",data)
						rs = http.read(_rs)
						print(rs)
				else:
					print(sp._error)
					break
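The paging loop above stops once pageSize * page reaches total_found; a standalone sketch of that termination rule with hypothetical numbers:
page, pageSize, total_found = 1, 100, 250
while pageSize * page < total_found:
	page += 1
print(page)  # 3 pages cover 250 matches at 100 per page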
Example #6
def format_html(post_data):
	"""内容提取
	"""
	result = {}
	data = {}
	boss = []
	#pdb.set_trace()
	http = HttpWrap()
	res = http.request(url_info,"POST",post_data)
	if res.code !=200:
		return False
	html = http.read(res)

	try:
		data = json.loads(html)[0]
		result['corp_id']=post_data['id']
		result['corp_org']=post_data['org']
		result['corp_seq_id']=post_data['seq_id']
		for k,v in data.items():
			if k in title_base:
				result[title_base[k]]=v	
	except:
		traceback.print_exc()
		return False
	try:
		url = 'http://www.jsgsj.gov.cn:58888/ecipplatform/ciServlet.json?ciEnter=true'
		data = {'CORP_ID':post_data['id'],'CORP_ORG':post_data['org'],'CORP_SEQ_ID':post_data['seq_id'],'pageNo':1,'pageSize':5,'showRecordLine':1,'specificQuery':'investmentInfor'}	
		res = http.request(url_info,"POST",data)
		html = http.read(res)
		data = json.loads(html)
		for row in data['items']:
			boss.append([row['C1'],row['C2']])
		if boss:
			result['shareholders']=json.dumps(boss)
	except Exception as e:
		pass		

	result['gov_url'] = json.dumps(post_data)
	return result
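A call sketch reusing the post_data shape that appears in the __main__ block of Example #14:
post_data = {'id': '1597861', 'org': '1402', 'seq_id': '6', 'specificQuery': 'basicInfo'}
record = format_html(post_data)
if record:
	print(record['corp_id'], record.get('shareholders'))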
Example #7
def format_html(url):
	"""内容提取
	"""
	#pdb.set_trace()	
	result = {}
	data = {}
	boss = []
	
	http = HttpWrap()
	res = http.request(url)
	if res.code !=200:
		return False
	html = http.read(res,'b')
	try:
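		# the response body appears to carry a 6-character prefix ahead of the JSON payload (presumably an anti-hijacking guard), so it is stripped before parsing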
		item = json.loads(html[6:].decode())
		base = item['base']
		if 'investors' in item and item['investors']:
			for row in item['investors']:
				if 'inv' in row: 
					boss.append(row['inv'])
		if boss:
			result['shareholders']=json.dumps(boss)
		#registered capital
		if 'regcap' in base:
			if 'regcapcur' in base:
				base['regcap']="%s万%s" % (base['regcap'],base['regcapcur'])
			else:
				base['regcap']="%s万人民币" % base['regcap']
		for k,v in base.items():
			if k in title_base and v:
				result[title_base[k]] = v.strip()
			#else:
			#	print(k,v)
		result['gov_url'] = url
	except:
		traceback.print_exc()
		return False
	return result	
Example #8
class clt_test:
	def __init__(self):
		self.http 	= HttpWrap()
		self.host	= "http://test.api.biz72.com/index.php?r="
		
	def on_get(self,req,resp,action):
		mod,act = action.split('.')
		if action == "proinfo.search":
			param = {}
			opt = [];
			row = {}
			row["page"]		= 1
			row["pageSize"]	= 20
			row["keyw"]		= "机械"		
			opt.append(row)
			param["param"] 		= opt
			param["field1"]		= "id,title"
			param["field2"]		= "id,desc"
			url = "%s/%s/%s" % (self.host,mod,act)
			res = self.http.request(url,'POST',param)
			res = self.http.read(res)
			res = JSONEncoder().encode(res)
			resp.body = json.dumps(res, indent=1)
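The on_get(req, resp, action) signature and the resp.body assignment look like a Falcon-style resource; a hedged wiring sketch, assuming Falcon really is the framework and that the route template below is how action gets filled in (neither is shown in the snippet):
import falcon

# hypothetical routing; the '/clt/{action}' template is an assumption
app = falcon.API()
app.add_route('/clt/{action}', clt_test())
# a GET to /clt/proinfo.search would then invoke on_get with action='proinfo.search'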
Example #9
def get_info(corp, proxyinfo=""):
    socket.setdefaulttimeout(10)
    """采集函数
	参数说明:
		corp  公司名称
		proxyinfo 代理ip  (格式为 ip:port) 为空时使用本机ip
	返回值说明:
	    status  状态码
	    base_info 采集到的工商信息
	"""
    # 状态码 0 正常,1代理ip失效或者网站无法访问 2 ip被封 3公司不存在
    status = 0
    # basic info
    base_info = []
    # shareholder info
    boss_info = []

    # pdb.set_trace()
    # HTTP client wrapper
    http = HttpWrap()
    # set the proxy, format {"http|https": "ip:port"}
    if proxyinfo:
        http.set_proxy({"http": proxyinfo})
    res = http.request(url_home, method="GET")
    # visit the home page to register cookies; fail immediately if it is unreachable
    if res.code != 200:
        # print(res.code)
        if res.code > 200:
            ille_proxy_ip.add(proxyinfo)
        return [corp, base_info, 1, proxyinfo]

    """验证过程,循环验证直到成功"""
    # 成功标识
    flag = 0
    html = ""
    cu_time = int(time.time())
    # error counter
    err_type = 0
    while flag == 0:
        if datamodel.g_exit:
            return [corp, base_info, 1, proxyinfo]
        try:
            if err_type > 10:
                return [corp, base_info, 1, proxyinfo]
            rand_time = time.strftime("%a %b %d %Y %H:%M:%S GMT 0800")
            url = url_code % time.time()
            res = http.request(url, method="GET")
            data = {}
            # print('step...1')
            if res.code == 200:

                # fetch the captcha image
                try:
                    im = res.read()
                except:
                    im = ""
                    time.sleep(1)
                    continue
                code = http_upload_image(img_decode_url, im)

                try:
                    code = json.loads(code)
                except Exception as e:
                    # traceback.print_exc()
                    continue

                print(code)
                # manually enter the captcha (alternative)
                # code = raw_input('input the code:').decode('gbk').encode('utf-8')
                if not code:
                    err_type += 1
                    continue
                data = {"checkNo": request.quote(code)}
                # reset request headers
                http.reset_headers()
                http.set_header("Accept", "application/json, text/javascript, */*; q=0.08")
                http.set_header("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8")
                http.set_header("Referer", url_home)
                http.set_header("X-Requested-With", "XMLHttpRequest")
                http.set_header("Referer", "http://www.ahcredit.gov.cn/searchList.jsp")
                res = http.request(url_check, "POST", data)
                # print('step...3')
                if res.code == 200:
                    html = http.read(res)
                    jdata = json.loads(html)
                    # print(jdata)
                    if jdata == "{success:true}":
                        break

                else:
                    err_type += 1
                    # print(res.code)
                    time.sleep(5)
                    # return [corp,base_info,1,proxyinfo]
            else:
                # print(res.code)
                if res.code == 403:
                    time.sleep(20)
                err_type += 1

        except Exception as e:
            traceback.print_exc()
        time.sleep(1)
        # pdb.set_trace()
        # list page

    try:
        data = {"checkNo": code, "entName": corp}

        res = http.request(url_list, "POST", data)
        if res.code == -1:
            # print('get html :',res.code)
            return [corp, base_info, 1, proxyinfo]
            # pdb.set_trace()
        html = http.read(res)

        if "无查询结果" in html:
            # print('您搜索的条件无查询结果')
            return [corp, base_info, 3, proxyinfo]

        try:
            context = etree.HTML(html)
        except:
            print(html)
            return [corp, base_info, 1, proxyinfo]
        url_nodes = context.xpath('//div[@class="list"]//a')
        if not url_nodes:
            return [corp, base_info, 1, proxyinfo]
        for url_node in url_nodes:
            try:
                url = "%s%s" % (host, url_node.get("href"))
                _base_info = format_html(url)
                if _base_info:
                    base_info.append(_base_info)
            except Exception as e:
                traceback.print_exc()
                if "reg_no" not in base_info:
                    base_info.append(_base_info)
    except Exception as e:
        traceback.print_exc()
        return [corp, base_info, 1, proxyinfo]
    return [corp, base_info, status, proxyinfo]
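A sketch of how the [corp, base_info, status, proxyinfo] return value might be consumed, following the status codes documented above (the company name is taken from the commented-out test call in Example #14):
corp, base_info, status, proxyinfo = get_info('常熟市兴达机械有限公司', proxyinfo='')
if status == 0:
	for record in base_info:
		print(record.get('reg_no'), record.get('shareholders'))
elif status == 3:
	print('no such company registered')
else:
	print('proxy failed or IP blocked; retry with a different proxy')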
Example #10
def get_info(corp,proxyinfo=''):
	if len(corp) <4:
		return [corp,[],3,proxyinfo]
	socket.setdefaulttimeout(10)
	"""采集函数
	参数说明:
		corp  公司名称
		proxyinfo 代理ip  (格式为 ip:port) 为空时使用本机ip
	返回值说明:
	    status  状态码
	    base_info 采集到的工商信息
	"""
	#状态码 0 正常,1代理ip失效或者网站无法访问 2 ip被封 3公司不存在
	status = 0
	#basic info
	base_info=[]

	#pdb.set_trace()
	#HTTP client wrapper
	http = HttpWrap()
	#set the proxy, format {"http|https":"ip:port"}
	if proxyinfo:
		http.set_proxy({'http':proxyinfo})
	res = http.request(url_home,method='GET')
	#visit the home page to register cookies; fail immediately if it is unreachable
	if res.code != 200:
		#print(res.code)
		#if res.code>200:
		#	ille_proxy_ip.add(proxyinfo)
		return [corp,base_info,1,proxyinfo]
	
	"""验证过程,循环验证直到成功"""
	#成功标识
	flag = 0	
	html=""
	cu_time = int(time.time())
	#error counter
	err_type = 0 
	while flag ==0:
		if datamodel.g_exit:
			return [corp,base_info,1,proxyinfo]
		try:
			
			url = url_code % time.time()
			res = http.request(url,method='GET')
			data = {}
			#print('step...1',res.code)
			if res.code == 200:
				#fetch the captcha image
				try:
					im = res.read()
				except:
					continue
				code = http_upload_image(img_decode_url,im)
					
				#manually enter the captcha (alternative)
				#code = raw_input('input the code:').decode('gbk').encode('utf-8')
				#print("code:",code,corp,proxyinfo)
				#print('step...2')
				data={'name':corp,'verifyCode':code}
				#reset request headers
				http.reset_headers()
				http.set_header('Accept','application/json, text/javascript, */*; q=0.01')
				http.set_header('Content-Type','application/x-www-form-urlencoded; charset=UTF-8')
				http.set_header('Referer',url_home)
				http.set_header('X-Requested-With','XMLHttpRequest')
				res = http.request(url_check,"POST",data)
				#print('step...3')
				if res.code == 200:
					html = http.read(res)
					jdata = json.loads(html)
					#print(jdata)
					if jdata[0]['TIPS'] and 'IP'  in jdata[0]['TIPS']:
						#print(jdata)
						ille_proxy_ip.add(proxyinfo)
						return [corp,base_info,2,proxyinfo]
					if "没有符合查询条件的结果" in jdata[0]['COUNT']:
						return [corp,base_info,3,proxyinfo]
					#	logger.info("iperror:%" % jdata[0]['TIPS'])
					#print ("res:",html)
					if not jdata[0]['TIPS']:
						html = jdata[0]['INFO']
						break
				else:
					err_type+=1
					#return [corp,base_info,1,proxyinfo]
			#elif res.code >200:
				#return [corp,base_info,1,proxyinfo]
			else:
				return [corp,base_info,1,proxyinfo]
				err_type+=1
			if err_type >10 :
				return [corp,base_info,1,proxyinfo]		
		except Exception as e:
			traceback.print_exc()	
		time.sleep(1)
	#pdb.set_trace()
	#list page

	#extract the detail-page URLs
	if not html:
		return [corp,base_info,1,proxyinfo]
	#print ("html:",html)
	try:
		
		context = etree.HTML(html)
		dt_nodes = context.xpath('//dt')
		dd_nodes = context.xpath('//dd')
		for i in range(0,len(dt_nodes)):

			if dt_nodes[i].text:
				comname = dt_nodes[i].text
				text = etree.tostring(dd_nodes[i],encoding='utf-8').decode()
				base = get_iile_info(text)
				base['name'] = comname
				base_info.append(base)
			else:
				base={}
				link_info = dt_nodes[i].find('a').get('onclick').strip()[12:-2].replace("'",'').split(',')
				url ='http://www.jsgsj.gov.cn:58888%s' % (link_info[0].strip())
				data = {'containContextPath':link_info[5].strip(),'id':link_info[2].strip(),
						'name':'','org':link_info[1].strip(),'reg_no':link_info[4].strip(),'seq_id':link_info[3].strip()}

				#basic data
				data={'id':link_info[2].strip(),'org':link_info[1].strip(),'seq_id':link_info[3].strip(),'specificQuery':'basicInfo'}
				base = format_html(data)
				if base:
					base_info.append(base)
	except Exception as e:
		traceback.print_exc()

		if  not  base_info:
			return [corp,base_info,1,proxyinfo]
	return [corp,base_info,status,proxyinfo]
Example #11
def get_info(corp,proxyinfo=''):
	if len(corp) <4:
		return [corp,[],3,proxyinfo]
	socket.setdefaulttimeout(10)
	"""采集函数
	参数说明:
		corp  公司名称
		proxyinfo 代理ip  (格式为 ip:port) 为空时使用本机ip
	返回值说明:
	    status  状态码
	    base_info 采集到的工商信息
	"""
	#状态码 0 正常,1代理ip失效或者网站无法访问 2 ip被封 3公司不存在
	status = 0
	#basic info
	base_info=[]
	#shareholder info
	boss_info=[]

	#pdb.set_trace()
	#HTTP client wrapper
	http = HttpWrap()
	#set the proxy, format {"http|https":"ip:port"}
	if proxyinfo:
		http.set_proxy({'http':proxyinfo})
	res = http.request(url_home,method='GET')
	#visit the home page to register cookies; fail immediately if it is unreachable
	if res.code != 200:
		#print(res.code)
		if res.code>200:
			ille_proxy_ip.add(proxyinfo)
		return [corp,base_info,1,proxyinfo]
	
	"""验证过程,循环验证直到成功"""
	#成功标识
	flag = 0	
	html=""
	cu_time = int(time.time())
	#error counter
	err_type = 0 
	while flag ==0:
		if datamodel.g_exit:
			return [corp,base_info,1,proxyinfo]
		try:
			if err_type >10 :
				return [corp,base_info,1,proxyinfo]
			
			url = url_code % int(time.time())
			res = http.request(url,method='GET')
			data = {}
			#print('step...1')
			if res.code == 200:
	
				#fetch the captcha image
				try:
					im = res.read()
				except:
					im=''
					time.sleep(1)
					continue

				code = http_upload_image(img_decode_url,im)

				#manually enter the captcha (alternative)
				#code = raw_input('input the code:').decode('gbk').encode('utf-8')
				if not code:
					err_type+=1
					continue
				data={'searchContent':corp,'vcode':code}
				#reset request headers
				http.reset_headers()
				http.set_header('Accept','application/json, text/javascript, */*; q=0.08')
				http.set_header('Content-Type','application/x-www-form-urlencoded; charset=UTF-8')
				http.set_header('Referer',url_home)
				http.set_header('X-Requested-With','XMLHttpRequest')
				res = http.request(url_check,"POST",data)
				#print('step...3')
				if res.code == 200:
					html = http.read(res)
					#pdb.set_trace()
					if '您查询的信息多于' not in html:
						continue
					if '您查询的信息多于 0 条记录' in html:
						return [corp,base_info,3,proxyinfo]
					flag=1
					break
				else:
					err_type+=1
					#print(res.code)
					time.sleep(5)
					#return [corp,base_info,1,proxyinfo]
			else:
				#print(res.code)
				if res.code == 403:
					time.sleep(20)
				err_type+=1
		
		except Exception as e:
			#traceback.print_exc()	
			err_type+=1
		time.sleep(1)
		if err_type>10:
			return [corp,base_info,1,proxyinfo]

	try:
		context = etree.HTML(html)
		nodes = context.xpath('//div[@class="content"]//a')
		#pdb.set_trace()
		for node in nodes:
			 
			#url = "http://tjcredit.gov.cn%s" % node.get('href')	
			entid = node.get('href').split('=')[1]
			url="http://tjcredit.gov.cn/platform/saic/baseInfo.json?entId=%s&departmentId=scjgw&infoClassId=dj" % entid	

			'''
			res = result.read().decode()
			jurl_result =re.findall('"/platform/saic/topInfoClass.json.*"',res)
			if not jurl_result:
				continue
			jurl = "http://tjcredit.gov.cn%s" % jurl_result[0][1:-1]
			j_result = http.request(jurl)
			if j_result.code !=200:
				continue
			jdata = json.loads(j_result.read().decode())
			base_url = "http://tjcredit.gov.cn%s" % jdata[0]['url']
			result = http.request(base_url)
			
			if result.code !=200:
				continue
			'''
			_base_info = format_html(url)
			if _base_info:
				base_info.append(_base_info)
			#else:
			#	print(html)
	except:
		traceback.print_exc()
		print(url)
		return [corp,base_info,1,proxyinfo]	
	
	return [corp,base_info,status,proxyinfo]
Example #12
def get_info(corp,proxyinfo=''):
	if len(corp) <4:
		return [corp,[],3,proxyinfo]
	socket.setdefaulttimeout(10)
	"""采集函数
	参数说明:
		corp  公司名称
		proxyinfo 代理ip  (格式为 ip:port) 为空时使用本机ip
	返回值说明:
	    status  状态码
	    base_info 采集到的工商信息
	"""
	#状态码 0 正常,1代理ip失效或者网站无法访问 2 ip被封 3公司不存在
	status = 0
	#basic info
	base_info=[]
	#shareholder info
	boss_info=[]
	#pdb.set_trace()
	#HTTP client wrapper
	http = HttpWrap()
	#set the proxy, format {"http|https":"ip:port"}
	if proxyinfo:
		http.set_proxy({'http':proxyinfo})
	res = http.request(url_home,method='GET')
	#visit the home page to register cookies; fail immediately if it is unreachable
	if res.code != 200:
		#print(res.code)
		if res.code>200:
			ille_proxy_ip.add(proxyinfo)
		return [corp,base_info,1,proxyinfo]
	
	"""验证过程,循环验证直到成功"""
	#成功标识
	flag = 0	
	html=""
	cu_time = int(time.time())
	#error counter
	err_type = 0 
	while flag ==0:
		if datamodel.g_exit:
			return [corp,base_info,1,proxyinfo]
		try:
			if err_type >10 :
				return [corp,base_info,1,proxyinfo]
			
			url = url_code % int(time.time())
			res = http.request(url,method='GET')
			data = {}
			#print('step...1')
			if res.code == 200:
	
				#fetch the captcha image
				try:
					im = res.read()
				except:
					im=''
					time.sleep(1)
					continue

				code = http_upload_image(img_decode_url,im)

				#print(code)
				#manually enter the captcha (alternative)
				#code = raw_input('input the code:').decode('gbk').encode('utf-8')
				if not code:
					err_type+=1
					continue
				data={'key':corp,'code':code}
				#reset request headers
				http.reset_headers()
				http.set_header('Accept','application/json, text/javascript, */*; q=0.08')
				http.set_header('Content-Type','application/x-www-form-urlencoded; charset=UTF-8')
				http.set_header('Referer',url_home)
				http.set_header('X-Requested-With','XMLHttpRequest')
				res = http.request(url_check,"POST",data)
				#print('step...3')
				if res.code == 200:
					html = http.read(res)
					if '验证码不正确' in html:
						continue
					if '您搜索的条件无查询结果' in html:
						return [corp,base_info,3,proxyinfo]

					break
				
				else:
					err_type+=1
					#print(res.code)
					time.sleep(5)
					#return [corp,base_info,1,proxyinfo]
			else:
				#print(res.code)
				if res.code == 403:
					time.sleep(20)
				err_type+=1
		
		except Exception as e:
			#traceback.print_exc()	
			err_type+=1
		time.sleep(1)
		if err_type>10:
			return [corp,base_info,1,proxyinfo]
	#extract data
	try:
		context = etree.HTML(html)
		nodes = context.xpath('//div[@class="item"]/a')
		#pdb.set_trace()
		for node in nodes:
			_base_info = {}
			entId, opid,entType = (node.get('data-entid'),node.get('data-id'),node.get('data-type'))
			name = node.text.strip()
			
			data = {'entId':entId,'id':opid,'type':entType,'name':name}
			page_res = http.request(url_list,'POST',data)
			page = http.read(page_res)
			page_txt = etree.HTML(page)
			data_type= re.findall('type=\'(\d+)\'',page_txt.get('ng-init'))[0]
			url = url_info % (entId,http.urlencode(opid),data_type)
			_base_info = format_html(url)
			if _base_info:
				base_info.append(_base_info)
		if base_info:
			return [corp,base_info,status,proxyinfo]
	except:
		#traceback.print_exc()
		return [corp,base_info,1,proxyinfo]	
	return [corp,base_info,status,proxyinfo]
Example #13
def get_info(corp,proxyinfo=''):
	socket.setdefaulttimeout(10)
	"""采集函数
	参数说明:
		corp  公司名称
		proxyinfo 代理ip  (格式为 ip:port) 为空时使用本机ip
	返回值说明:
	    status  状态码
	    base_info 采集到的工商信息
	"""
	#状态码 0 正常,1代理ip失效或者网站无法访问 2 ip被封 3公司不存在
	status = 0
	#basic info
	base_info={}
	#shareholder info
	boss_info=[]
	#maps the JSON result fields to output keys
	title_base={'C2':'name','C1':'reg_no','C3':'type','C4':'reg_date','C5':'faren','C6':'reg_capital','C7':'addr','C8':'biz_scope','C9':'open_date','C10':'close_date','C11':'reg_authority','C12':'audit_date','C13':'reg_status'}

	#pdb.set_trace()
	#HTTP client wrapper
	http = HttpWrap()
	#set the proxy, format {"http|https":"ip:port"}
	if proxyinfo:
		http.set_proxy({'http':proxyinfo})
	res = http.request(url_home,method='GET')
	#visit the home page to register cookies; fail immediately if it is unreachable
	if res.code != 200:
		#print(res.code)
		if res.code>200:
			ille_proxy_ip.add(proxyinfo)
		return [corp,base_info,1,proxyinfo]
	
	"""验证过程,循环验证直到成功"""
	#成功标识
	flag = 0	
	html=""
	cu_time = int(time.time())
	#error counter
	err_type = 0 
	while flag ==0:
		#if datamodel.g_exit:
		#	return [corp,base_info,1,proxyinfo]
		try:
			rand_time = time.strftime('%a %b %d %Y %H:%M:%S GMT 0800')
			url = url_code #% rand_time
			res = http.request(url,method='GET')
			data = {}
			#print('step...1')
			if res.code == 200:
	
				#fetch the captcha image
				try:
					im = res.read()
				except:
					im=''
					continue
				code = http_upload_image(img_decode_url,im)
					
				#manually enter the captcha (alternative)
				#code = raw_input('input the code:').decode('gbk').encode('utf-8')
				#print(code)
				#print('step...2')
				data={'name':corp,'verifyCode':code}
				#reset request headers
				http.reset_headers()
				http.set_header('Accept','application/json, text/javascript, */*; q=0.01')
				http.set_header('Content-Type','application/x-www-form-urlencoded; charset=UTF-8')
				http.set_header('Referer',url_home)
				http.set_header('X-Requested-With','XMLHttpRequest')
				res = http.request(url_check,"POST",data)
				#print('step...3')
				if res.code == 200:
					html = http.read(res)
					jdata = json.loads(html)
					#print(jdata)
					if jdata[0]['TIPS'] and 'IP'  in jdata[0]['TIPS']:
						#print(jdata)
						ille_proxy_ip.add(proxyinfo)
						return [corp,base_info,2,proxyinfo]
					if "没有符合查询条件的结果" in jdata[0]['COUNT']:
						return [corp,base_info,3,proxyinfo]
					#	logger.info("iperror:%" % jdata[0]['TIPS'])
					#print ("res:",html)
					if not jdata[0]['TIPS']:
						html = jdata[0]['INFO']
						break
				else:
					err_type+=1
					#return [corp,base_info,1,proxyinfo]
			else:
				err_type+=1
			if err_type >10 :
				return [corp,base_info,1,proxyinfo]		
		except Exception as e:
			traceback.print_exc()	
		time.sleep(1)
	#pdb.set_trace()
	#list page

	#extract the detail-page URL
	if not html:
		return [corp,base_info,1,proxyinfo]
	#print ("html:",html)
	try:
		
		context = etree.HTML(html)
		nodes = context.xpath("//a")
		link_info = nodes[0].attrib['onclick'].strip()[12:-2].replace("'",'').split(',')
		url ='http://www.jsgsj.gov.cn:58888%s' % (link_info[0].strip())
		data = {'containContextPath':link_info[5].strip(),'id':link_info[2].strip(),
			'name':'','org':link_info[1].strip(),'reg_no':link_info[4].strip(),'seq_id':link_info[3].strip()}
					
		#detail-page basic data
		#self.reset_headers()
		#self.set_headers('Content-Type','application/x-www-form-urlencoded; charset=UTF-8')
		#self.set_headers('Accept','text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
		#self.set_headers('Referer','http://www.jsgsj.gov.cn:58888/province/queryResultList.jsp')
		#res = self.request(url)
		###############
		#http.reset_headers()
		'''
		self.set_headers('Accept','application/json, text/javascript, */*; q=0.01')
		self.set_headers('Content-Type','application/x-www-form-urlencoded; charset=UTF-8')
		self.set_headers('X-Requested-With','XMLHttpRequest')
		self.set_headers('Referer',http://www.jsgsj.gov.cn:58888/ecipplatform/inner_pspc/pspc_queryCorpInfor_gsRelease.jsp')
		'''
		http.headers={'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)','Accept-Encoding': 'gzip, deflate','Accept-Language': 'zh-CN'}
		http.headers['Accept']='application/json, text/javascript, */*; q=0.01'
		http.headers['Content-Type'] = 'application/x-www-form-urlencoded; charset=UTF-8'
		http.headers['Referer'] = 'http://www.jsgsj.gov.cn:58888/ecipplatform/inner_pspc/pspc_queryCorpInfor_gsRelease.jsp'
		#basic data
		url = "http://www.jsgsj.gov.cn:58888/ecipplatform/ciServlet.json?ciEnter=true"
		data={'id':link_info[2].strip(),'org':link_info[1].strip(),'seq_id':link_info[3].strip(),'specificQuery':'basicInfo'}
		base_info['gov_url']=json.dumps(data)
		res = http.request(url,'POST',data)
		#connection error, treat as failure
		if  res.code==-1:
			print(res.code)
			return [corp,base_info,1,proxyinfo]
		info = res.read().decode()
		#print(res.code,info)
		data = json.loads(info)[0]
		base_info['corp_id']=link_info[2].strip()
		base_info['corp_org']=link_info[1].strip()
		base_info['corp_seq_id']=link_info[3].strip()
		for k,v in data.items():
			if k in title_base:
				base_info[title_base[k]]=v
		#shareholder info
		url = 'http://www.jsgsj.gov.cn:58888/ecipplatform/ciServlet.json?ciEnter=true'
		data = {'CORP_ID':link_info[2].strip(),'CORP_ORG':link_info[1].strip(),'CORP_SEQ_ID':link_info[3].strip(),'pageNo':1,'pageSize':5,'showRecordLine':1,'specificQuery':'investmentInfor'}	

		res = http.request(url,'POST',data)
		#if fetching shareholders fails, skip them and return what we have
		if res.code !=200:
			return [corp,base_info,status,proxyinfo]
		info = res.read().decode()
		#print(res.code,info)
		try:
			data = json.loads(info)
			for row in data['items']:
				boss_info.append([row['C1'],row['C2']])
			if boss_info:
				base_info['shareholders']=json.dumps(boss_info)
		except Exception as e:
			traceback.print_exc()
	except Exception as e:
		#traceback.print_exc()
		try:
			base_info['name']=corp
			base_info['reg_status']='已注销'
			pe={'reg_no':'注册号:\<span\>(.*?)\<',
				'faren':'法定代表人:\<span\>(.*?)\<|投资人:\<span\>(.*?)\<|经营者:\<span\>(.*?)\<',
				'reg_authority':'登记机关:\<span\>(.*?)\<',
				'cancell_date':'注销日期:\<span\>(.*?)\<|吊销日期:\<span\>(.*?)\<'}
			for k,v in pe.items(): 
				rs = re.findall(v,html)
				if rs:
					base_info[k] = rs[0]
					if type(rs[0]) in [list,tuple]:
						if rs[0][0]:
							base_info[k] = rs[0][0]
						elif rs[0][1]:
							base_info[k] = rs[0][1]
						elif(len(rs[0])>2):
							base_info[k] = rs[0][2]
		except:
			pass
		if 'reg_no' not in base_info:
			return [corp,base_info,1,proxyinfo]
	return [corp,base_info,status,proxyinfo]
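The fallback patterns above use alternation, so re.findall yields tuples of groups where unmatched branches come back empty; a small illustration of the case the tuple-handling code guards against (the sample HTML fragment and name are made up):
import re
rs = re.findall('法定代表人:<span>(.*?)<|投资人:<span>(.*?)<', '投资人:<span>张三<')
print(rs)  # [('', '张三')] -- the first group is empty, so the code falls back to rs[0][1]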
Example #14

def input_info(res):
	print ("******%s*****"	% res[0])
	print ("基本信息")
	for k,v in res[1].items():
		print (k,v)
		#print "%s:	%s" % (k,v)
	print("-----------------")



				
if __name__ == "__main__":
	
	#img_decode_url="http://127.0.0.1:1983/imgcode/base"

	#res = get_info('常熟市国宇纺织有限公司')
	#proxyinfo = {'http':'117.177.243.50:8080'}
	
	#res = get_info('常熟市兴达机械有限公司',proxyinfo="")
	#input_info(res)
	data={'seq_id': '6', 'specificQuery': 'basicInfo', 'org': '1402', 'id': '1597861'}
	http = HttpWrap()
	url = "http://www.jsgsj.gov.cn:58888/ecipplatform/ciServlet.json?ciEnter=true"
	res = http.request(url,'POST',data)
	print(res.read())
	
Example #15
	def __init__(self):
		self.http 	= HttpWrap()
		self.host	= "http://test.api.biz72.com/index.php?r="