Exemplo n.º 1
0
 def __init__(self,htmlph,xmlph,wsplitph,wbpath):
     '''
     Store the working paths and create the helper objects.

     htmlph, xmlph, wsplitph and wbpath are kept unchanged on the
     instance under the same attribute names.
     '''
     # Make UTF-8 the interpreter-wide default string encoding.
     reload(sys)
     sys.setdefaultencoding('utf-8')

     # Helper objects: an Ictclas instance built from 'ICTCLAS50/' and
     # the word bar.
     self.ict = Ictclas('ICTCLAS50/')
     self.wordbar = wordlist()

     # Keyword used to tell the content sections apart.
     self.spword = '@[email protected]'

     # Paths supplied by the caller.
     self.htmlph, self.xmlph = htmlph, xmlph
     self.wsplitph, self.wbpath = wsplitph, wbpath
Exemplo n.º 2
0
    def __init__(self,site_id):
        '''
        Initialise the directories and helper objects for one site.

        site_id is handed to path(); every location used below is
        obtained from the object it returns.
        '''
        locations = path(site_id)
        self.path = locations

        self.ict = Ictclas('ICTCLAS50/')
        self.spword = '@[email protected]'

        # Working locations.
        self.xmlph = locations.g_document()
        self.wsplitph = locations.g_wordsplit()
        self.wbpath = locations.g_wordbar()

        # Word bank, created from the word-bar location.
        self.wordbar = Thes.Create_Thesaurus(self.wbpath)

        # Database connection and its cursor.
        connection = sq.connect(locations.g_chun_sqlite())
        self.cx = connection
        self.cu = connection.cursor()
Exemplo n.º 3
0
    def __init__(self,pageph,hitph):
        '''
        Build the in-memory state needed to answer queries.

        pageph -- stored as self.pageph
                  NOTE(review): presumably the file initPageranker() reads - confirm
        hitph  -- stored as self.hitph
                  NOTE(review): presumably the file inithits() reads - confirm

        The statements below depend on each other: the empty lists must
        exist before initPageranker()/inithits() run, self.hithash is
        read only after initHashWid(), and self.length only after
        inithits().
        '''
        self.ict=Ictclas('ICTCLAS50/') 
        self.hitph=hitph
        self.pageph=pageph
        self.hitdoclist=Hitlist() #score tally list
        self.wordbar=wordbar('../store/wordbar') #word bank, used to obtain a wordID
        #hithash related
        self.hithasher=InitHashWid('../store/sortedwidhits','../store/hithash')
        self.hithasher.initHashWid()#initialise hithash
        #init rank total: sum of the scores of a single doc
        self.ranktotal=InitRankTotal('../store/sorteddochits','../store/tranks')
        self.ranktotal.initTotalRank()

        self.hits=[]
        #initialise pagerank
        self.pageranker=[]
        self.initPageranker()
        self.inithits()#initialise hits
        self.hithash=self.hithasher.hithash
        self.length=len(self.hits) #length of hits
        #print 'length of hits is',self.length
        #sorting
        self.sorter=sorter()
Exemplo n.º 4
0
class parser:
    '''
    Offline pipeline from downloaded HTML to a word bank file.

    The stages are separate methods, each reading the previous one's
    output directory:
      transDoc  -- files in htmlph  -> document XML files in xmlph
      splitWord -- files in xmlph   -> word-segmented files in wsplitph
      transWbar -- files in wsplitph -> word bank file at wbpath
    '''
    def __init__(self,htmlph,xmlph,wsplitph,wbpath):
        '''
        Store the working paths and create the helper objects.

        htmlph, xmlph, wsplitph and wbpath are kept unchanged on the
        instance under the same attribute names.
        '''
        # Make UTF-8 the interpreter-wide default string encoding.
        reload(sys)
        sys.setdefaultencoding('utf-8')
        self.ict=Ictclas('ICTCLAS50/')
        self.wordbar=wordlist()#wordBar
        self.spword='@[email protected]' #keyword that separates the content sections
        #set the corresponding paths
        self.htmlph=htmlph
        self.xmlph=xmlph
        self.wsplitph=wsplitph
        self.wbpath=wbpath

    def transDoc(self):
        '''
        Convert every HTML source file in htmlph into a document file.

        Each file is read, its encoding is detected with chardet, and
        content not reported as 'utf-8' is decoded. The result is
        parsed by collector and the XML is written to a file of the
        same name in xmlph. A file whose XML cannot be produced is
        left empty or partial on disk (splitWord skips files shorter
        than 200 characters).
        '''
        for hp in os.listdir(self.htmlph):
            print(hp)
            with open(self.htmlph+'/'+hp) as src:
                c=src.read()
            #detect the encoding automatically and convert
            res=chardet.detect(c)
            print(res)
            coding=res['encoding']
            if coding!='utf-8':
                try:
                    c=c.decode(coding)
                except (LookupError,UnicodeError,TypeError):
                    # unknown codec, undecodable bytes, or no encoding
                    # detected (None): carry on with the raw content
                    print('something wrong')
            collec=collector(c)#start parsing
            with open(self.xmlph+'/'+hp,'w') as out:
                try:
                    out.write(collec.xml(hp).toxml())#write to the new file
                except Exception:
                    # best effort: report and move on to the next file
                    print('can not trans xml')

    def _split_items(self,root,selector,attr):
        '''
        Segment attribute `attr` of every node matched by `selector`.

        Returns the segmented texts concatenated, each followed by one
        space ('' when nothing matches).
        '''
        nodes=root(selector)
        parts=[]
        for i in range(len(nodes)):
            parts.append(self.ict.split( nodes.eq(i).attr(attr).encode('utf-8') )+' ')
        return ''.join(parts)

    def splitWord(self):
        '''
        Segment each section of every document file and save the result.

        For each file in xmlph with at least 200 characters, seven
        sections are written to a file of the same name in wsplitph,
        joined by self.spword, in this order: title, b, h1, h2, h3, a,
        content.
        '''
        for dp in os.listdir(self.xmlph+'/'):
            print(dp)
            with open(self.xmlph+'/'+dp) as src:
                c=src.read()
            if len(c)<200:
                continue #ignore empty files
            root=pq(c)#processed with pyquery
            title=root('title').eq(0)
            sections=[
                self.ict.split( title.attr('text').encode('utf-8'))+' ',
                self._split_items(root,'b item','text'),
                self._split_items(root,'h1 item','text'),
                self._split_items(root,'h2 item','text'),
                self._split_items(root,'h3 item','text'),
                # anchor text was previously segmented but never stored
                self._split_items(root,'a item','name'),
                self.ict.split( root('content').eq(0).text().encode('utf-8'))+' ',
            ]
            #save the result
            with open(self.wsplitph+'/'+dp,'w+') as out:
                out.write(self.spword.join(sections))

    def __wordFind(self,strr):
        '''
        Feed the acceptable words of `strr` to self.wordbar.find().

        Words longer than 10 characters, or containing '=', '.' or a
        digit, are skipped.
        '''
        has_digit=re.compile(r'\d').search
        for word in strr.split():
            if len(word)>10:
                continue
            if '=' in word or '.' in word or has_digit(word):
                continue
            self.wordbar.find(word)

    def transWbar(self):
        '''
        Collect the words of every segmented file and store the word bar.

        Each file in wsplitph is cut at self.spword and every piece is
        passed through the word filter. Afterwards the items of
        self.wordbar are written to wbpath, each followed by one space.
        '''
        for xml in os.listdir(self.wsplitph):
            print(xml)
            with open(self.wsplitph+'/'+xml) as src:
                c=src.read()
            for piece in c.split(self.spword):
                self.__wordFind(piece)
        #save the final word bank as one string
        with open(self.wbpath,'w') as out:
            out.write(''.join(word+' ' for word in self.wordbar))

    def _debug(self):
        '''Print every word stored at wbpath together with its hash().'''
        with open(self.wbpath) as src:
            c=src.read()
        for i in c.split():
            print('%s %s' % (i,hash(i)))
Exemplo n.º 5
0
class Parser:
    '''
    Parsing library.

    Converts the downloaded html source into documents; part of the
    functionality is embedded in the spider.
    '''
    def __init__(self,site_id):
        '''
        Initialise the directories and helper objects for one site.

        site_id is handed to path(); every location used by this class
        is obtained from the object it returns.
        '''
        self.path = path(site_id)

        self.ict=Ictclas('ICTCLAS50/')
        self.spword='@[email protected]'

        self.xmlph=self.path.g_document()
        self.wsplitph=self.path.g_wordsplit()
        self.wbpath=self.path.g_wordbar()

        #initialise the word bank
        self.wordbar = Thes.Create_Thesaurus(self.wbpath)

        #database related
        self.cx = sq.connect(self.path.g_chun_sqlite())
        self.cu = self.cx.cursor()

    def _split_items(self,root,selector,attr):
        '''
        Segment attribute `attr` of every node matched by `selector`.

        Returns the segmented texts concatenated, each followed by one
        space ('' when nothing matches).
        '''
        nodes=root(selector)
        parts=[]
        for i in range(len(nodes)):
            parts.append(self.ict.split( nodes.eq(i).attr(attr).encode('utf-8') )+' ')
        return ''.join(parts)

    def splitWord(self):
        '''
        Convert every document file into its wordsplit form.

        For each file in xmlph with at least 200 characters, seven
        segmented sections are written to a file of the same name in
        wsplitph, in this order: title, b, h1, h2, h3, a, content.
        The sections are joined by the plain string self.spword.
        '''
        for dp in os.listdir(self.xmlph+'/'):
            with open(self.xmlph+'/'+dp) as src:
                c=src.read()
            if len(c)<200:
                continue
            root=pq(c)
            title=root('title').eq(0)
            sections=[
                self.ict.split( title.attr('text').encode('utf-8'))+' ',
                self._split_items(root,'b item','text'),
                self._split_items(root,'h1 item','text'),
                self._split_items(root,'h2 item','text'),
                self._split_items(root,'h3 item','text'),
                # anchor text was previously segmented but never stored
                self._split_items(root,'a item','name'),
                self.ict.split( root('content').eq(0).text().encode('utf-8'))+' ',
            ]
            #save the result
            with open(self.wsplitph+'/'+dp,'w+') as out:
                out.write(self.spword.join(sections))

    def __wordFind(self,strr):
        '''
        Feed the acceptable words of `strr` to self.wordbar.find().

        Words longer than 10 characters, or containing '=', '.' or a
        digit, are skipped.
        '''
        has_digit=re.compile(r'\d').search
        for word in strr.split():
            if len(word)>10:
                continue
            if '=' in word or '.' in word or has_digit(word):
                continue
            self.wordbar.find(word)

    def transWbar(self):
        '''
        Initialise the word bank from the segmented files.

        Each file in wsplitph is cut at self.spword and every piece is
        passed through the word filter. The word bank then writes its
        words to wbpath, creates its hash index and saves its width
        data at the locations supplied by self.path. Description words
        (get_split_des_words) are not fed into the word bank here.
        '''
        for xml in os.listdir(self.wsplitph):
            with open(self.wsplitph+'/'+xml) as src:
                c=src.read()
            for piece in c.split(self.spword):
                self.__wordFind(piece)

        with open(self.wbpath,'w') as out:
            out.write(self.wordbar.get_words())

        print('begin to create hash')

        self.wordbar.create_hash(self.path.g_hash_index())

        self.wordbar.save_wide(self.path.g_word_wide())

    def get_split_des_words(self,docID):
        '''
        Return the segmented words of the `des` column for one document.

        docID is converted with int() before it is placed in the
        query. Returns [''] when the row is missing or `des` is empty.
        '''
        self.cu.execute("select des from lib where docID = %d"%int(docID))

        row=self.cu.fetchone()

        # fetchone() yields None when no row matches
        if row and row[0]:
            return self.ict.split( str(row[0]) ).split()
        return ['']

    def _debug(self):
        '''Print every word stored at wbpath together with its hash().'''
        with open(self.wbpath) as src:
            c=src.read()
        for i in c.split():
            print('%s %s' % (i,hash(i)))
Exemplo n.º 6
0
class Query:
    '''
    Query library.

    Loads the hit table and the per-document rank data once, then
    scores documents for a query string with query().
    '''
    def __init__(self,pageph,hitph):
        '''
        Build the in-memory state needed to answer queries.

        pageph -- path of the file read by initPageranker()
        hitph  -- path of the file read by inithits()

        The statements below depend on each other: the empty lists must
        exist before the loaders run, self.hithash is read only after
        initHashWid(), and self.length only after inithits().
        '''
        self.ict=Ictclas('ICTCLAS50/')
        self.hitph=hitph
        self.pageph=pageph
        self.hitdoclist=Hitlist() #score tally list
        self.wordbar=wordbar('../store/wordbar') #word bank, used to obtain a wordID
        #hithash related
        self.hithasher=InitHashWid('../store/sortedwidhits','../store/hithash')
        self.hithasher.initHashWid()#initialise hithash
        #init rank total: sum of the scores of a single doc
        self.ranktotal=InitRankTotal('../store/sorteddochits','../store/tranks')
        self.ranktotal.initTotalRank()

        self.hits=[]
        #initialise pagerank
        self.pageranker=[]
        self.initPageranker()
        self.inithits()#initialise hits
        self.hithash=self.hithasher.hithash
        self.length=len(self.hits) #length of hits
        #sorting
        self.sorter=sorter()

    def initPageranker(self):
        '''Append float(line) for every line of self.pageph to self.pageranker.'''
        print('init pageranker')
        with open(self.pageph) as src:
            lines=src.readlines()
        for line in lines:
            self.pageranker.append(float(line))

    def inithits(self):
        '''Append the whitespace-split fields of every line of self.hitph to self.hits.'''
        with open(self.hitph) as src:
            lines=src.readlines()
        for line in lines:
            self.hits.append(line.split())

    def query(self,strr):
        '''
        Run a single query and print the ranked result.

        strr is segmented into words. For every word known to the word
        bank, its slice of self.hits is fed to self.hitdoclist. The
        per-column scores are then divided by the document totals from
        self.ranktotal, a total is stored in the last column of each
        row, and the list is handed to self.sorter.
        '''
        words=self.wordsplit(strr) #the segmented query
        for word in words.split():
            #process each word
            print('%s %s' % ('--start to query word--',word))
            wordid=self.wordbar.find(word) #the wordID to look up
            print('%s %s' % ('查得的wordID为',wordid))
            # NOTE(review): a wordID or hithash position of 0 is falsy
            # and is treated as "not found" here, as before - confirm
            # that find() never returns 0 for a real entry.
            if not wordid:
                continue

            #hithasher returns the position of the target data in hithash
            hithashpos=self.hithasher.find([wordid,0])
            if not hithashpos:
                continue

            #locate the word's segment of the hits table: starthitpos..endhitpos
            starthitpos=int(self.hithash[hithashpos][1])
            print('%s %s' % ('查得的hitpos为',starthitpos))
            print('%s %s' % ('开始地址',starthitpos))

            if starthitpos+1<self.length:
                try:
                    nextentry=self.hithash[hithashpos+1]
                except IndexError:
                    # the word is the last hithash entry, so its
                    # segment runs to the end of the hits table
                    endhitpos=self.length-1
                else:
                    endhitpos=int(nextentry[1])-1
            else:
                endhitpos=starthitpos
            print('%s %s' % ('结束地址',endhitpos))

            #scan the segment and add the scores
            for index in range(starthitpos,endhitpos+1):
                self.hitdoclist.find(self.hits[index])

        #restore the tail
        print('对结尾进行还原')
        self.hitdoclist.InitStatus()

        print('the former doclist---------------------------')
        for i in self.hitdoclist:
            print(i)

        #turn each score into a relative score
        for i,score in enumerate(self.hitdoclist):

            #adjust the precision of the decimal context
            getcontext().prec = 6
            docid=score[0]
            rankpos=self.ranktotal.find([docid,0])#position of the record

            for j,total in enumerate(self.ranktotal.tranks[rankpos]):
                #column 0 is skipped; every other column is divided by its total
                if j==0:
                    continue
                if int(total)==0:
                    self.hitdoclist[i][j]=0
                else:
                    self.hitdoclist[i][j]=float(score[j])/float(total)

            #weights once intended per tag:
            #title:9 h1:3 h2:2 h3:2 b:1 a:0.5 content:0.5
            print('now calculate the pageranker with the result')
            print('%s %s' % ('the docid is',self.hitdoclist[i][0]))

            #self.hitdoclist[i][-1]= self.pageranker[ int(self.hitdoclist[i][0])]*(  self.hitdoclist[i][1]*0.5 + self.hitdoclist[i][2]*0.056+ self.hitdoclist[i][3]*0.167 +self.hitdoclist[i][4]*0.11 + self.hitdoclist[i][5]*0.11  +self.hitdoclist[i][6]*0.027 +self.hitdoclist[i][7]*0.027 )
            # NOTE(review): the total starts at column 1 and the loop
            # then adds every column from 1 on, including the total
            # column itself; kept exactly as it was - confirm intent.
            self.hitdoclist[i][-1]=self.hitdoclist[i][1]
            for k,summ in enumerate(self.hitdoclist[i]):
                if k>0:
                    self.hitdoclist[i][-1]+=summ

        print('start to print the former hitdoclist')
        for i in self.hitdoclist:
            print(i)
        self.sorter.run(self.hitdoclist)
        print('the result')
        self.sorter.showlist()
        #return self.getResList() #result string for the server

    def wordsplit(self,sentence):
        '''Segment the query sentence with self.ict.'''
        return self.ict.split(sentence)

    def getResList(self):
        '''Return the first field of every hitdoclist row, each followed by one space.'''
        return ''.join(str(row[0])+' ' for row in self.hitdoclist)
Exemplo n.º 7
0
# -*- coding: utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf-8')




from ICTCLAS50.Ictclas import Ictclas

import chardet as cdt

# Smoke test: segment a sample phrase and print the encoding chardet
# detects for each resulting word.
ict=Ictclas('ICTCLAS50/')

words = ict.split("中国农业大学")

print(words)

# NOTE(review): assumes ict.split() returns one whitespace-separated
# string - confirm. Iterating that string directly would walk it one
# character at a time, so it is cut into words first.
for w in words.split():
    print(w)
    print(cdt.detect(w))