import urllib
import json

from WebSpyder import WebSpyder


class DaiYiCha(object):
    def __init__(self):
        self.spyder = WebSpyder()

    # Download the detailed data for one platform
    def get_ajax_info(self, p2p_name, outpath='p2p_info/'):
        assert isinstance(p2p_name, unicode)
        url = ('http://www.daiyicha.com/cha.php?view=show&word=' +
               urllib.quote(str(p2p_name.encode('gb2312'))))
        data = self.spyder.get_data(url)
        marker = 'jq.getJSON("plugin.php?id=lonvoy_siteinfo:ax", '
        start = data.find(marker) + len(marker)
        tmp_data = data[start:]
        tmp_data = tmp_data[:tmp_data.find(', function (json)')]
        del data
        # The embedded JS object literal has unquoted keys; quote them so
        # json.loads can parse it
        tmp_data = tmp_data.replace(':', '":')
        tmp_data = tmp_data.replace(',', ',"')
        tmp_data = tmp_data.replace('{', '{"')
        data_dict = json.loads(tmp_data)
        data_url = 'http://www.daiyicha.com/plugin.php?id=lonvoy_siteinfo:ax'
        for k, v in data_dict.iteritems():
            data_url += '&' + str(k).strip() + '=' + urllib.quote(
                str(v.encode('gb2312')))
        total_data = self.spyder.get_data(data_url)
        f = open(outpath + p2p_name + '.txt', 'w')
        f.write(total_data)
        f.close()
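# A minimal usage sketch, assuming WebSpyder.get_data returns the page HTML
# as a byte string and the p2p_info/ output directory already exists; the
# platform name below is only an illustrative placeholder.
if __name__ == '__main__':
    crawler = DaiYiCha()
    crawler.get_ajax_info(u'某示例平台', outpath='p2p_info/')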
import Queue

from WebSpyder import WebSpyder


class Spyder_Runner(object):
    def __init__(self, parser, name='Spyder_Runner'):
        self.name = name
        # Jobs for list pages
        self.list_jobs = Queue.Queue(maxsize=100)
        # Jobs for detail pages
        self.detail_jobs = Queue.Queue(maxsize=10000)
        # The downloading spider
        self.downloader = WebSpyder()
        # The parser
        self.parser = parser

    # Add a list-page job
    def add_list_job(self, list_url):
        if self.parser.is_myurl(list_url):
            self.list_jobs.put(list_url)

    # Decide whether a URL should be added to the list queue
    def __is_add_to_list_job__(self, list_url):
        return True

    # Add detail-page jobs
    def add_detail_job(self, details):
        for detail in filter(lambda x: self.parser.is_myurl(x), details):
            self.detail_jobs.put(detail)

    # Main loop
    def run(self):
        while True:
            # Handle a list page
            try:
                list_url = self.list_jobs.get(timeout=5)
                if list_url:
                    htmldata = self.downloader.get_htmldata(list_url)
                    self.add_detail_job(self.parser.parse_list(htmldata))
                    # Get the next list URL
                    next_list_url = self.parser.next_list_url(list_url)
                    # Decide whether it needs to go into the download queue
                    if self.__is_add_to_list_job__(next_list_url):
                        self.add_list_job(next_list_url)
            except Exception as e:
                print e
            # Handle detail pages
            try:
                while self.detail_jobs.qsize() > 0:
                    detail = self.detail_jobs.get(timeout=5)
                    if detail:
                        print self.parser.parse_detail(detail)
            except Exception as e:
                print e
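# A sketch of the parser interface that Spyder_Runner expects. DemoParser and
# the example.com URLs are hypothetical stand-ins, not part of the project;
# a real parser would implement these same four methods.
class DemoParser(object):
    def is_myurl(self, url):
        # Accept only URLs belonging to the (hypothetical) target site
        return isinstance(url, basestring) and url.startswith('http://example.com/')

    def parse_list(self, htmldata):
        # Return the detail-page URLs found on a downloaded list page
        return []

    def next_list_url(self, list_url):
        # Return the URL of the next list page
        return None

    def parse_detail(self, detail):
        # Return whatever data the detail job yields
        return detail

# Example wiring (run() loops forever, so it is left commented out):
# runner = Spyder_Runner(DemoParser(), name='demo_runner')
# runner.add_list_job('http://example.com/list_1.html')
# runner.run()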
from WebSpyder import WebSpyder


class SuperBabySpyder(object):
    def __init__(self):
        self.spyder = WebSpyder()

    def get_data(self, url):
        return self.spyder.get_data(url)

    def parse_list(self, data):
        pass

    def parse_deatail(self, data):
        pass
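# A hedged sketch of how a concrete spider might specialise SuperBabySpyder;
# the subclass name is illustrative and the list-page layout is assumed, not
# taken from any real target site.
from bs4 import BeautifulSoup


class ExampleListSpyder(SuperBabySpyder):
    def parse_list(self, data):
        # e.g. collect every hyperlink found on a downloaded list page
        soup = BeautifulSoup(data, 'html.parser')
        return [a.get('href') for a in soup.find_all('a') if a.get('href')]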
""" Created on Thu Apr 28 13:32:38 2016 @author: gong @description:下载“佛山法院网”中的“破产公告”,“中国破产资产网”中的“行业新闻”,“中国法院网”中的“法院公告”整理破产企业信息 """ import StringIO,gzip import time import json import datetime from WebSpyder import WebSpyder from bs4 import BeautifulSoup webspyder = WebSpyder() <<<<<<< HEAD DAY = 5 ======= DAY = 10 >>>>>>> origin/master #解压gzip def gzdecode(data) : compressedstream = StringIO.StringIO(data) gziper = gzip.GzipFile(fileobj=compressedstream) data2 = gziper.read() # 读取解压缩后数据 return data2 def get_foshan(spyder = webspyder): #先获得时间数组格式的日期
import StringIO
import gzip
import json
import urllib
import traceback

import bs4

import ParseDetail  # project-local module providing parse_datail / deep_detail
from WebSpyder import WebSpyder


class ZhengXin11315(object):
    def __init__(self, spyder=None):
        self.spyder = spyder
        if not self.spyder:
            self.spyder = WebSpyder()
        self.dictionary = {
            u"北京": ("110000", u"北京市"), u"天津": ("120000", u"天津市"),
            u"河北": ("130000", u"河北省"), u"山西": ("140000", u"山西省"),
            u"内蒙古": ("150000", u"内蒙古"), u"辽宁": ("210000", u"辽宁省"),
            u"吉林": ("220000", u"吉林省"), u"黑龙江": ("230000", u"黑龙江省"),
            u"上海": ("310000", u"上海市"), u"江苏": ("320000", u"江苏省"),
            u"浙江": ("330000", u"浙江省"), u"安徽": ("340000", u"安徽省"),
            u"福建": ("350000", u"福建省"), u"江西": ("360000", u"江西省"),
            u"山东": ("370000", u"山东省"), u"河南": ("410000", u"河南省"),
            u"湖北": ("420000", u"湖北省"), u"湖南": ("430000", u"湖南省"),
            u"广东": ("440000", u"广东省"), u"广西": ("450000", u"广西"),
            u"海南": ("460000", u"海南省"), u"重庆": ("500000", u"重庆市"),
            u"四川": ("510000", u"四川省"), u"贵州": ("520000", u"贵州省"),
            u"云南": ("530000", u"云南省"), u"西藏": ("540000", u"西藏"),
            u"陕西": ("610000", u"陕西省"), u"甘肃": ("620000", u"甘肃省"),
            u"青海": ("630000", u"青海省"), u"宁夏": ("640000", u"宁夏"),
            u"新疆": ("650000", u"新疆"),
            "": "",
        }

    # Longest common subsequence length
    def max_substring(self, strA, strB):
        # Similarity between a displayed entry and the searched company
        lenA, lenB = len(strA), len(strB)
        c = [[0 for i in range(lenB)] for j in range(lenA)]
        # Initialise the first row and column
        for i in range(lenB):
            if strA[0] == strB[i]:
                c[0][i] = 1
            else:
                c[0][i] = 0 if i == 0 else c[0][i - 1]
        for i in range(lenA):
            if strA[i] == strB[0]:
                c[i][0] = 1
            else:
                c[i][0] = 0 if i == 0 else c[i - 1][0]
        for i in range(1, lenB):
            for j in range(1, lenA):
                if strA[j] == strB[i]:
                    c[j][i] = 1 + c[j - 1][i - 1]
                else:
                    c[j][i] = max(c[j][i - 1], c[j - 1][i])
        return c[lenA - 1][lenB - 1]

    # Decompress gzip data
    def gzdecode(self, data):
        compressedstream = StringIO.StringIO(data)
        gziper = gzip.GzipFile(fileobj=compressedstream)
        data2 = gziper.read()  # read the decompressed data
        return data2

    # Edit distance
    def string_distance(self, strA, strB):
        # Similarity between a displayed entry and the searched company
        lenA, lenB = len(strA), len(strB)
        c = [[0 for i in range(lenB + 1)] for j in range(lenA + 1)]
        for i in range(lenA):
            c[i][lenB] = lenA - i
        for i in range(lenB):
            c[lenA][i] = lenB - i
        c[lenA][lenB] = 0
        for i in range(lenA - 1, -1, -1):
            for j in range(lenB - 1, -1, -1):
                if strB[j] == strA[i]:
                    c[i][j] = c[i + 1][j + 1]
                else:
                    c[i][j] = min(c[i][j + 1], c[i + 1][j], c[i + 1][j + 1]) + 1
        return c[0][0]

    def get_content(self, name, province=''):
        url = ("http://www.11315.com/newSearch?regionMc=%s&regionDm=%s"
               "&searchType=1&searchTypeHead=1&name=%s")
        keyword_n = urllib.quote(name.encode('utf-8'))
        keyword_p = ''
        keyword_id = ''
        if province:
            region = self.dictionary[province]
            keyword_p = urllib.quote(region[1].encode('utf-8'))
            keyword_id = region[0]
        url = url % (keyword_p, keyword_id, keyword_n)
        if isinstance(self.spyder, WebSpyder):
            return self.gzdecode(self.spyder.get_htmldata(url))
        else:
            return self.spyder.get_htmldata(url)

    # Find the entry that most likely matches the searched name
    def match(self, content, name):
        distance = []
        records = [i for i in content.find_all("div")
                   if i["class"][0] == u"innerBox"]
        for i in records:
            distance.append(self.string_distance(name, i.find_all("a")[1].text))
        target = 0
        mindis = len(name)
        for i in range(len(records)):
            if distance[i] < mindis:
                mindis = distance[i]
                target = i
        # Credit-report URL; that page is fetched next
        crediturl = records[target].find_all("td")[1].text.strip()
        # The legal representative usually appears as an image on the later
        # page, so capture the text here in advance
        delegate = records[target].find_all("td")[2].text.strip()
        return crediturl, delegate

    # Write a record to file
    def to_file(self, dict_data, filename):
        for k, v in dict_data.iteritems():
            print k, v
        f = open(filename, 'a')
        f.write(json.dumps(dict_data).encode('utf8') + '\n')
        f.close()

    # Wrapper around the query
    def search(self, name, outfile, province=''):
        try:
            # if province == '': province = "选择地区"
            # return query.get_content()
            html = self.get_content(name, province)
            if html.find(u'系统检测到您的请求存在异常') >= 0:
                print u'IP被网站封了,oh yeah!\n'
                return None
            content = bs4.BeautifulSoup(html, "html.parser").find("div", id="main")
            records_num = int(content.find("p").a.text)
            if records_num == 0:
                return -1  # query found no records
            # delegate is attached to the result further below
            crediturl, delegate = self.match(content, name)
            if isinstance(self.spyder, WebSpyder):
                creditdata = self.gzdecode(self.spyder.get_htmldata(crediturl))
            else:
                creditdata = self.spyder.get_htmldata(crediturl)
            if creditdata.find(u'系统检测到您的请求存在异常') >= 0:
                print u'IP被网站封了,oh yeah!\n'
                return None
            # print creditdata
            result_1 = ParseDetail.parse_datail(creditdata)
            if isinstance(self.spyder, WebSpyder):
                deepdata = self.gzdecode(
                    self.spyder.get_htmldata(crediturl + result_1[u'更多信息']))
            else:
                deepdata = self.spyder.get_htmldata(crediturl + result_1[u'更多信息'])
            result_2 = ParseDetail.deep_detail(deepdata)
            result_1[u'企业法人'] = delegate
            result_1[u'主营产品'] = result_2[u'主营产品']
            result_1[u'公司介绍'] = result_2[u'公司介绍']
            self.to_file(result_1, outfile)
        except:
            traceback.print_exc()
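# A minimal usage sketch, assuming WebSpyder.get_htmldata returns the raw
# (gzip-compressed) response body; the company name and output file below are
# placeholders, and the province must be one of the keys in zx.dictionary.
if __name__ == '__main__':
    zx = ZhengXin11315()
    ret = zx.search(u'某某科技有限公司', 'zhengxin_result.txt', province=u'广东')
    if ret == -1:
        print 'No matching record was found'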
""" import os from WebSpyder import WebSpyder from urllib import urlencode from bs4 import BeautifulSoup import traceback import pandas as pd from multiprocessing.dummy import Pool as ThreadPool from extract_company_name import segmentation #全局的字典 global P2P_DICT P2P_DICT = None #全局爬虫 global SPYDER SPYDER = WebSpyder() #解析必应的返回数据 def parse_bing_data(data): soup = BeautifulSoup(data, 'lxml') soup.find('ol', attrs={'id': 'b_results'}).findAll('') #获得p2p的相关信息和链接 def get_all_p2p_names(spyder, outfile='p2p.csv', max_list=92): url = 'http://www.rjb777.com/a/pingtai/list_%s.html' result = [] for i in xrange(1, max_list + 1): tmp_url = url % i print 'Now processing %s!' % tmp_url