Exemplo n.º 1
0
Arquivo: pextractor.py Projeto: fay/wt
 def __init__(self):
     """Create a CixingDictLoader and keep both it and what it loads.

     The loader is stored as ``self.loader`` and the value returned by its
     ``load()`` call is stored as ``self.dict``.
     """
     loader = CixingDictLoader()
     self.loader = loader
     self.dict = loader.load()
Exemplo n.º 2
0
Arquivo: pextractor.py Projeto: fay/wt
class PhraseExtractor(object):
    """Find candidate phrases that are both right- and left-complete.

    The extractor builds a suffix array and an LCP array over the token
    sequence and over its reverse, collects repeated substrings from each,
    and keeps the ones that appear in both collections.
    """

    # ``unicode`` only exists on Python 2; elsewhere plain ``str`` is the
    # only text type to recognise.
    try:
        _TEXT_TYPES = (str, unicode)
    except NameError:
        _TEXT_TYPES = (str,)

    def __init__(self):
        """Create the dictionary loader and store what ``load()`` returns."""
        self.loader = CixingDictLoader()
        self.dict = self.loader.load()

    def extract(self, context):
        """Return the complete substrings found in ``context.tokens``.

        Side effects: stores the forward suffix array in ``context.suffix``
        and the forward LCP array in ``context.lcp``.

        Returns the list produced by ``intersect_lcs_rcs``.
        """
        # ``text`` may be a plain string, or a list of words (Chinese and
        # English mixed).
        text = context.tokens
        text_inversed = text[::-1]
        suffix = suffixsorter.build_suffix_array(text)
        context.suffix = suffix
        # Suffix array of the reversed text.
        suffix_inversed = suffixsorter.build_suffix_array(text_inversed)
        lcp = suffixsorter.calculateLcp(text, suffix)
        context.lcp = lcp
        lcp_inversed = suffixsorter.calculateLcp(text_inversed, suffix_inversed)

        # Right complete substrings.
        rcs = self.rcs(lcp, context)
        # Sort rcs by id (it works because the suffix array is already sorted
        # and the rcs list has only changed a little due to the stack's
        # effect in ``rcs``).
        rcs.sort()

        # Left complete substrings: the same scan over the reversed text.
        # NOTE(review): ``rcs`` reads ``context.suffix`` (the forward suffix
        # array) even for this call with ``lcp_inversed`` -- confirm intended.
        lcs = self.rcs(lcp_inversed, context)

        rcs_ordered = [
            text[suffix[item.id]:suffix[item.id] + lcp[item.id]]
            for item in rcs
        ]
        # Each slice is reversed back so it reads in forward order.
        lcs_ordered = [
            text_inversed[
                suffix_inversed[item.id]:
                suffix_inversed[item.id] + lcp_inversed[item.id]
            ][::-1]
            for item in lcs
        ]
        # rcs_ordered does not need sorting here; only lcs_ordered does.
        lcs_ordered.sort()
        return self.intersect_lcs_rcs(rcs, lcs, rcs_ordered, lcs_ordered, context)

    def _opens_phrase(self, index, lcp, context):
        """Tell whether ``lcp[index]`` may start a new entry on an empty stack.

        True when the length is at most MAX_PHRASE_LEN and either it is at
        least MIN_PHRASE_LEN, or it is exactly 1 and the token at
        ``context.suffix[index]`` has type '<ALPHANUM>' and length >= 3.
        """
        length = lcp[index]
        long_single_word = (
            length == 1
            and context.token_types[context.suffix[index]] == '<ALPHANUM>'
            and len(context.tokens[context.suffix[index]]) >= 3
        )
        return ((long_single_word or length >= MIN_PHRASE_LEN)
                and length <= MAX_PHRASE_LEN)

    # get right complete substring
    def rcs(self, lcp, context):
        """Scan ``lcp`` with a stack and return the finished Substring items.

        Each returned item carries ``id`` (an index into ``lcp``) and
        ``freq`` (the occurrence count accumulated during the scan).
        Index 0 of ``lcp`` is never visited; the scan starts at 1.

        NOTE(review): entries still on the stack when the scan reaches the
        end of ``lcp`` are not appended to the result.
        """
        total = len(lcp)
        result = []
        # A real list is used as the stack: a ``range`` object cannot be
        # assigned into on Python 3.
        stack = []
        i = 1
        while i < total:
            if not stack:
                if self._opens_phrase(i, lcp, context):
                    stack.append(Substring(id=i, freq=2))
                i += 1
                continue

            top = stack[-1]
            if lcp[top.id] < lcp[i]:
                # A longer common prefix: a new substring appears.
                stack.append(Substring(id=i, freq=2))
                i += 1
            elif lcp[top.id] == lcp[i]:
                # Same length means the same substring, because the suffixes
                # are visited in sorted order.
                top.freq += 1
                i += 1
            else:
                # Shorter common prefix: the substring on top of the stack is
                # finished and can be emitted.  ``i`` is deliberately not
                # advanced unless a new entry is pushed below, so the same
                # position is compared against the next stack entry.
                finished = stack.pop()
                result.append(finished)
                f = finished.freq
                if stack:
                    # The enclosing substring inherits the occurrences.
                    stack[-1].freq = stack[-1].freq + f - 1
                elif MIN_PHRASE_LEN <= lcp[i] <= MAX_PHRASE_LEN:
                    stack.append(Substring(id=i, freq=2 + f - 1))
                    i += 1
        return result

    # Join the items of a list into one string.
    def list2str(self, lst):
        """Join tokens into one string, spacing adjacent alphanumeric terms.

        ``lst`` is returned unchanged when it is already a string.  Items
        that are not strings contribute their ``text`` attribute.  A single
        space is inserted between two consecutive terms that both contain an
        ASCII letter or digit.  If the second term contains an ASCII letter
        or digit and the first term does not and is one character long, the
        first term is dropped.
        """
        text_types = self._TEXT_TYPES
        # Already a string: return it as is.
        if isinstance(lst, text_types):
            return lst
        parts = []
        prev_alnum = None
        previous = ''
        for count, term in enumerate(lst):
            if not isinstance(term, text_types):
                term = term.text
            has_alnum = re.search('[0-9a-zA-Z]', term)
            # Two alphanumeric terms in a row get a space between them.
            if prev_alnum and has_alnum:
                parts.append(" ")
            if not prev_alnum and has_alnum and count == 1:
                if len(previous) == 1:
                    # Only the one-character first term has been collected so
                    # far; discard it.
                    del parts[:]
            prev_alnum = has_alnum
            previous = term
            parts.append(term)
        return ''.join(parts)

    def intersect_lcs_rcs(self, rcs, lcs, ordered_rcs, ordered_lcs, context):
        """Return the items of ``rcs`` whose text also occurs in ``ordered_lcs``.

        ``ordered_rcs[j]`` must be the token slice that belongs to ``rcs[j]``.
        Both ordered lists are walked in parallel as in a sorted merge.  For
        every kept item, ``text`` is set to the joined string and
        ``doc_freq`` is updated with per-document occurrence counts derived
        from ``context.suffix`` and ``context.term_doc_range``.

        ``lcs`` is accepted for interface compatibility and is not read.
        """
        i = 0
        j = 0
        results = []

        tdr = context.term_doc_range
        while i < len(ordered_lcs) and j < len(ordered_rcs):
            # The ordered lists may hold word lists; turn them into strings
            # so they can be compared and used as keys.
            l = self.list2str(ordered_lcs[i])
            r = self.list2str(ordered_rcs[j])
            # Look for the intersection of lcs and rcs.
            if l == r:
                # Skip entries found in the loaded dictionary (per the
                # original comment: adverbs, conjunctions, pronouns, ...),
                # and entries whose non-alphanumeric part is too long: this
                # keeps labels short and guards against ad-like spam in RSS
                # articles.
                if (l in self.dict
                        or len(re.sub('[a-zA-Z0-9]', '', l)) >= MAX_CHINESE_LABEL_LEN):
                    i += 1
                    j += 1
                    continue
                candidate = rcs[j]
                candidate.text = l
                # Count the occurrences of the complete substring per
                # document.  The occurrences are the ``freq`` consecutive
                # suffix array entries starting at ``id - 1``.
                first = candidate.id - 1
                for position in range(first, first + candidate.freq):
                    begin = context.suffix[position]
                    # The document is the first range boundary greater than
                    # ``begin``; when none is greater the last index is used
                    # (0 when ``tdr`` is empty).
                    doc_id = 0
                    for doc_id, boundary in enumerate(tdr):
                        if begin < boundary:
                            break
                    candidate.doc_freq[doc_id] = candidate.doc_freq.get(doc_id, 0) + 1
                # Q: Why choose rcs's substring as results returned?
                # A: rcs is sorted already while lcs is not
                results.append(candidate)
                i += 1
                j += 1
            elif l < r:
                i += 1
            else:
                j += 1
        return results