Example #1
class DaiYiCha(object):
    def __init__(self):
        self.spyder = WebSpyder()

    # Download the detailed data
    def get_ajax_info(self, p2p_name, outpath='p2p_info/'):
        assert (isinstance(p2p_name, unicode))
        url = 'http://www.daiyicha.com/cha.php?view=show&word=' + urllib.quote(
            str(p2p_name.encode('gb2312')))
        data = self.spyder.get_data(url)
        start = data.find(
            'jq.getJSON("plugin.php?id=lonvoy_siteinfo:ax", ') + len(
                'jq.getJSON("plugin.php?id=lonvoy_siteinfo:ax", ')
        tmp_data = data[start:]
        tmp_data = tmp_data[:tmp_data.find(', function (json)')]
        del data
        tmp_data = tmp_data.replace(':', '":')
        tmp_data = tmp_data.replace(',', ',"')
        tmp_data = tmp_data.replace('{', '{"')

        data_dict = json.loads(tmp_data)
        data_url = 'http://www.daiyicha.com/plugin.php?id=lonvoy_siteinfo:ax'
        for k, v in data_dict.iteritems():
            data_url += '&' + str(k).strip() + '=' + urllib.quote(
                str(v.encode('gb2312')))

        total_data = self.spyder.get_data(data_url)
        f = open(outpath + p2p_name + '.txt', 'w')
        f.write(total_data)
        f.close()
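
A minimal usage sketch (Python 2), assuming the snippet's module also imports urllib, json and WebSpyder and that the p2p_info/ output directory already exists; the platform name below is only a placeholder:

if __name__ == '__main__':
    cha = DaiYiCha()
    # hypothetical call, not part of the original project; writes p2p_info/<name>.txt
    cha.get_ajax_info(u'某网贷平台')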
Example #2
File: DaiYiCha.py Project: TimePi/Python
class DaiYiCha(object):
    def __init__(self):
        self.spyder = WebSpyder()
    
    # Download the detailed data
    def get_ajax_info(self,p2p_name,outpath='p2p_info/'):
        assert(isinstance(p2p_name,unicode))
        url = 'http://www.daiyicha.com/cha.php?view=show&word=' + urllib.quote(str(p2p_name.encode('gb2312')))
        data = self.spyder.get_data(url)
        start = data.find('jq.getJSON("plugin.php?id=lonvoy_siteinfo:ax", ')+len('jq.getJSON("plugin.php?id=lonvoy_siteinfo:ax", ')
        tmp_data = data[start:]
        tmp_data = tmp_data[:tmp_data.find(', function (json)')]
        del data
        tmp_data = tmp_data.replace(':','":')
        tmp_data = tmp_data.replace(',',',"')
        tmp_data = tmp_data.replace('{','{"')
        
        data_dict = json.loads(tmp_data)
        data_url = 'http://www.daiyicha.com/plugin.php?id=lonvoy_siteinfo:ax'
        for k,v in data_dict.iteritems():
            data_url += '&'+str(k).strip()+'='+urllib.quote(str(v.encode('gb2312')))
        
        total_data = self.spyder.get_data(data_url)
        f = open(outpath+p2p_name+'.txt','w')
        f.write(total_data)
        f.close()
Example #3
 def __init__(self,parser,name='Spyder_Runner'):
     self.name = name
     # Queue of list-page jobs
     self.list_jobs = Queue.Queue(maxsize = 100)
     
     # Queue of detail-page jobs
     self.detail_jobs = Queue.Queue(maxsize = 10000)
     
     # Downloading spider
     self.downloader = WebSpyder()
     
     # Parsing component
     self.parser = parser
Example #4
 def __init__(self,spyder = None):
     self.spyder = spyder
     if not self.spyder:
         self.spyder = WebSpyder()
     self.dictionary = {
                 u"北京": ("110000", u"北京市"), u"天津": ("120000", u"天津市"), u"河北": ("130000", u"河北省"), u"山西": ("140000", u"山西省"),u"内蒙古": ("150000", u"内蒙古"),
                 u"辽宁": ("210000", u"辽宁省"), u"吉林": ("220000", u"吉林省"), u"黑龙江": ("230000", u"黑龙江省"),
                 u"上海": ("310000", u"上海市"), u"江苏": ("320000", u"江苏省"), u"浙江": ("330000", u"浙江省"), u"安徽": ("340000", u"安徽省"), u"福建": ("350000", u"福建省"), u"江西": ("360000", u"江西省"), u"山东": ("370000", u"山东省"),
                 u"河南": ("410000", u"河南省"), u"湖北": ("420000", u"湖北省"), u"湖南": ("430000", u"湖南省"), u"广东": ("440000", u"广东省"), u"广西": ("450000", u"广西"), u"海南": ("460000", u"海南省"),
                 u"重庆": ("500000", u"重庆市"), u"四川": ("510000", u"四川省"), u"贵州": ("520000", u"贵州省"), u"云南": ("530000", u"云南省"), u"西藏": ("540000", u"西藏"),
                 u"陕西": ("610000", u"陕西省"), u"甘肃": ("620000", u"甘肃省"), u"青海": ("630000", u"青海省"), u"宁夏": ("640000", u"宁夏"), u"新疆": ("650000", u"新疆"),
                 "":"",
                 }
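
A minimal lookup sketch, assuming this __init__ belongs to the ZhengXin11315 class shown in Example #10 below (the variable names here are hypothetical):

zx = ZhengXin11315()
# a province name maps to (region code, full region name)
region_code, region_name = zx.dictionary[u"广东"]   # ("440000", u"广东省")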
Example #5
class Spyder_Runner(object):
    def __init__(self,parser,name='Spyder_Runner'):
        self.name = name
        # Queue of list-page jobs
        self.list_jobs = Queue.Queue(maxsize = 100)
        
        # Queue of detail-page jobs
        self.detail_jobs = Queue.Queue(maxsize = 10000)
        
        # Downloading spider
        self.downloader = WebSpyder()
        
        # Parsing component
        self.parser = parser
        
    # Add a list-page job
    def add_list_job(self,list_url):
        if self.parser.is_myurl(list_url):
            self.list_jobs.put(list_url)
    
    # Whether the list URL should be added to the job queue
    def __is_add_to_list_job__(self,list_url):
        return True
    
    # Add detail-page jobs
    def add_detail_job(self,details):
        for detail in filter(lambda x:self.parser.is_myurl(x),details):
            self.detail_jobs.put(detail)
            
    # Main loop
    def run(self):
        while True:
            # Process a list page
            try:
                list_url = self.list_jobs.get(timeout = 5)
                if list_url:
                    htmldata = self.downloader.get_htmldata(list_url)
                    self.add_detail_job(self.parser.parse_list(htmldata))
                    
                    # Get the URL of the next list page
                    next_list_url = self.parser.next_list_url(list_url)
                    
                    # Decide whether it should be added to the download queue
                    if self.__is_add_to_list_job__(next_list_url):
                        self.add_list_job(next_list_url)
            except Exception as e:
                print e
            
            # Process detail pages
            try:
                while self.detail_jobs.qsize() > 0:
                    detail = self.detail_jobs.get(timeout=5)
                    if detail:
                        print self.parser.parse_detail(detail)
                    
            except Exception as e:
                print e
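
A minimal wiring sketch, assuming a parser object that provides the is_myurl, parse_list, parse_detail and next_list_url methods the run loop relies on; the parser class name and start URL below are hypothetical:

if __name__ == '__main__':
    runner = Spyder_Runner(MyListParser(), name='demo_runner')
    runner.add_list_job('http://www.example.com/list_1.html')
    # runs forever: fetches list pages, queues detail pages, prints parsed records
    runner.run()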
Example #6
class SuperBabySpyder(object):
    def __init__(self):
        self.spyder = WebSpyder()
    
    def get_data(self,url):
        return self.spyder.get_data(url)
    
    def parse_list(self,data):
        pass
    
    def parse_deatail(self,data):
        pass
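
A minimal subclass sketch, assuming concrete spiders override the two parse hooks defined above (the hook names, including the parse_deatail spelling, follow the stub; the subclass itself is hypothetical):

class NewsSpyder(SuperBabySpyder):
    def parse_list(self, data):
        # extract detail-page URLs from a list page
        return []

    def parse_deatail(self, data):
        # extract the fields of interest from a detail page
        return {}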
Example #7
 def __init__(self):
     self.spyder = WebSpyder()
Example #8
 def __init__(self):
     self.spyder = WebSpyder()
Example #9
"""
Created on Thu Apr 28 13:32:38 2016

@author: gong

@description: Download the "破产公告" (bankruptcy announcements) from 佛山法院网, the "行业新闻" (industry news) from 中国破产资产网, and the "法院公告" (court announcements) from 中国法院网, and compile information on bankrupt enterprises

"""
import StringIO,gzip
import time
import json
import datetime
from WebSpyder import WebSpyder
from bs4 import BeautifulSoup

webspyder = WebSpyder()
DAY = 10

# Decompress gzip data
def gzdecode(data) :  
    compressedstream = StringIO.StringIO(data)  
    gziper = gzip.GzipFile(fileobj=compressedstream)    
    data2 = gziper.read()   # read the decompressed data
    return data2
    
def get_foshan(spyder = webspyder):
    # First get the date as a time struct
Example #10
class ZhengXin11315(object):
    def __init__(self,spyder = None):
        self.spyder = spyder
        if not self.spyder:
            self.spyder = WebSpyder()
        self.dictionary = {
                    u"北京": ("110000", u"北京市"), u"天津": ("120000", u"天津市"), u"河北": ("130000", u"河北省"), u"山西": ("140000", u"山西省"),u"内蒙古": ("150000", u"内蒙古"),
                    u"辽宁": ("210000", u"辽宁省"), u"吉林": ("220000", u"吉林省"), u"黑龙江": ("230000", u"黑龙江省"),
                    u"上海": ("310000", u"上海市"), u"江苏": ("320000", u"江苏省"), u"浙江": ("330000", u"浙江省"), u"安徽": ("340000", u"安徽省"), u"福建": ("350000", u"福建省"), u"江西": ("360000", u"江西省"), u"山东": ("370000", u"山东省"),
                    u"河南": ("410000", u"河南省"), u"湖北": ("420000", u"湖北省"), u"湖南": ("430000", u"湖南省"), u"广东": ("440000", u"广东省"), u"广西": ("450000", u"广西"), u"海南": ("460000", u"海南省"),
                    u"重庆": ("500000", u"重庆市"), u"四川": ("510000", u"四川省"), u"贵州": ("520000", u"贵州省"), u"云南": ("530000", u"云南省"), u"西藏": ("540000", u"西藏"),
                    u"陕西": ("610000", u"陕西省"), u"甘肃": ("620000", u"甘肃省"), u"青海": ("630000", u"青海省"), u"宁夏": ("640000", u"宁夏"), u"新疆": ("650000", u"新疆"),
                    "":"",
                    }
        
    # Longest common subsequence length
    def max_substring(self, strA, strB):
        # Measures similarity between a listed record and the searched company
        lenA, lenB = len(strA), len(strB)
        c = [[0 for i in range(lenB)] for j in range(lenA)]
        
        # Initialize the first row and column
        for i in range(lenB):
            if strA[0] == strB[i]:c[0][i] = 1
            else:c[0][i] = 0 if i == 0 else c[0][i-1]
        
        for i in range(lenA):
            if strA[i] == strB[0]:c[i][0] = 1
            else:c[i][0] = 0 if i== 0 else c[i-1][0]
        
        for i in range(1, lenA):
            for j in range(1, lenB):
                if strA[i] == strB[j]: c[i][j] = 1 + c[i-1][j-1]
                else: c[i][j] = max(c[i][j-1], c[i-1][j])
        
        return c[lenA-1][lenB-1]
    
    # Decompress gzip data
    def gzdecode(self,data) :  
        compressedstream = StringIO.StringIO(data)  
        gziper = gzip.GzipFile(fileobj=compressedstream)    
        data2 = gziper.read()   # read the decompressed data
        return data2
        
    # Edit (Levenshtein) distance
    def string_distance(self, strA, strB):
        # Measures similarity between a listed record and the searched company
        lenA, lenB = len(strA), len(strB)
        c = [[0 for i in range(lenB+1)] for j in range(lenA+1)]
        
        for i in range(lenA): c[i][lenB] = lenA - i
        for i in range(lenB): c[lenA][i] = lenB - i
        c[lenA][lenB] = 0
        
        for i in range(lenA-1, -1, -1):
            for j in range(lenB-1, -1, -1):
                if strB[j] == strA[i]: c[i][j] = c[i+1][j+1]
                else: c[i][j] = min(c[i][j+1], c[i+1][j], c[i+1][j+1]) + 1
        
        return c[0][0]
    
    def get_content(self, name, province=''):
        url = "http://www.11315.com/newSearch?regionMc=%s&regionDm=%s&searchType=1&searchTypeHead=1&name=%s"
        
        keyword_n = urllib.quote(name.encode('utf-8'))
        keyword_p = ''
        keyword_id = ''
        if province:
            region = self.dictionary[province]
            keyword_p = urllib.quote(region[1].encode('utf-8'))
            keyword_id = region[0]
        
        url = url % (keyword_p ,keyword_id ,keyword_n)
        if isinstance(self.spyder,WebSpyder):
            return self.gzdecode(self.spyder.get_htmldata(url))
        else:
            return self.spyder.get_htmldata(url)
        
    def match(self, content, name):
        # Find the record most likely to match
        distance = []
        records = [i for i in content.find_all("div") if i["class"][0] == u"innerBox"]
        for i in records:
            distance.append(self.string_distance(name, i.find_all("a")[1].text))
        target = 0
        mindis = len(name)
        for i in range(len(records)):
            if distance[i] < mindis:
                mindis = distance[i]
                target = i
        
        # Credit-profile URL; this page is fetched next
        crediturl = records[target].find_all("td")[1].text.strip()
        delegate = records[target].find_all("td")[2].text.strip()       # the legal representative is usually an image on later pages, so grab the text here in advance
                
        return crediturl, delegate
        
    # Write to a file
    def to_file(self,dict_data,filename):
        for k,v in dict_data.iteritems():
            print k,v
        f = open(filename,'a')
        f.write(json.dumps(dict_data).encode('utf8')+'\n')
        f.close()
        
    # Wrapper around the Query class
    def search(self,name,outfile, province=''):
        try:
            #if province=='': province = "选择地区"
            #return query.get_content()
            html = self.get_content(name, province)

            if html.find(u'系统检测到您的请求存在异常') >= 0:
                print u'IP被网站封了,oh yeah!\n'
                return None
            content = bs4.BeautifulSoup(html, "html.parser").find("div", id="main")
            
            records_num = int(content.find("p").a.text)
            if records_num==0: return -1 # query returned no results
            
            crediturl, delegate = self.match(content, name) # delegate is not used yet
            
            if isinstance(self.spyder,WebSpyder):
                creditdata = self.gzdecode(self.spyder.get_htmldata(crediturl))
            else:
                creditdata = self.spyder.get_htmldata(crediturl)
            
            if creditdata.find(u'系统检测到您的请求存在异常') >= 0:
                print u'IP被网站封了,oh yeah!\n'
                return None
            #print creditdata
            result_1 = ParseDetail.parse_datail(creditdata)
            if isinstance(self.spyder,WebSpyder):
                deepdata = self.gzdecode(self.spyder.get_htmldata(crediturl+result_1[u'更多信息']))
            else:
                deepdata = self.spyder.get_htmldata(crediturl+result_1[u'更多信息'])
            
            result_2 = ParseDetail.deep_detail(deepdata)
            result_1[u'企业法人'] = delegate
            result_1[u'主营产品'] = result_2[u'主营产品']
            result_1[u'公司介绍'] = result_2[u'公司介绍']
            self.to_file(result_1,outfile)
        except:
            traceback.print_exc()
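
A minimal usage sketch (Python 2), assuming the module also imports urllib, json, StringIO, gzip, bs4, traceback, WebSpyder and ParseDetail as the methods above require; the company name and output file below are placeholders:

if __name__ == '__main__':
    zx = ZhengXin11315()
    # hypothetical call, not part of the original project; appends one JSON line to the output file
    zx.search(u'某某科技有限公司', 'credit_info.txt', province=u'广东')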
Example #11
"""
import os
from WebSpyder import WebSpyder
from urllib import urlencode
from bs4 import BeautifulSoup
import traceback
import pandas as pd
from multiprocessing.dummy import Pool as ThreadPool
from extract_company_name import segmentation
# Global dictionary
global P2P_DICT
P2P_DICT = None

# Global spider
global SPYDER
SPYDER = WebSpyder()


# Parse the data returned by Bing
def parse_bing_data(data):
    soup = BeautifulSoup(data, 'lxml')
    soup.find('ol', attrs={'id': 'b_results'}).findAll('')


# Get P2P platform info and links
def get_all_p2p_names(spyder, outfile='p2p.csv', max_list=92):
    url = 'http://www.rjb777.com/a/pingtai/list_%s.html'
    result = []
    for i in xrange(1, max_list + 1):
        tmp_url = url % i
        print 'Now processing %s!' % tmp_url