Example #1
File: lingo.py  Project: fay/wt
def loadterms():
    ireader = IndexReader.open(STORE_DIR)
    a = ireader.terms()
    # column names: the text of each term (Chinese or English)
    colnames = []
    # term-frequency matrix: one row per term, one column per document
    data = []
    i = 0
    while a.next():
        term = a.term()
        if term.field() == 'summary':
            colnames.append(term.text())
            if term.text() == '':
                print 'ok'
                break
            i = i+1
            if i == 1000:
                break
            docs = ireader.termDocs(term)
            vector = []
            lastdoc = 0
            while docs.next():
                # pad with zeros for documents that do not contain the current term
                if lastdoc < docs.doc():
                    for j in range(docs.doc() - lastdoc):
                        vector.append(0)
                vector.append(docs.freq())
                lastdoc = docs.doc() + 1
            data.append(vector)
    ireader.close()
    return colnames, data
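Even with the zero-padding above, each row in data stops at the last document that contains the term, so rows come out with different lengths. Below is a minimal sketch of right-padding them into a rectangular term-document matrix; to_matrix is a hypothetical helper, and num_docs is assumed to be taken from ireader.numDocs() before the reader is closed.

import numpy as np

def to_matrix(data, num_docs):
    # hypothetical helper: right-pad each term row with zeros up to the
    # total document count, then stack into a dense term-document matrix
    matrix = np.zeros((len(data), num_docs))
    for row, freqs in enumerate(data):
        matrix[row, :len(freqs)] = freqs
    return matrix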
Example #2
File: clusters.py  Project: fay/wt
def loadterms():
    ireader = IndexReader.open(STORE_DIR)
    a = ireader.terms()
    colnames = []
    data = []
    while a.next():
        term = a.term()
        if term.field() == 'summary':
            colnames.append(term.text())
            docs = ireader.termDocs(term)
            vector = []
            lastdoc = 0
            while docs.next():
                # pad with zeros for documents that do not contain the current term
                if lastdoc < docs.doc():
                    for j in range(docs.doc() - lastdoc):
                        vector.append(0)
                vector.append(docs.freq())
                lastdoc = docs.doc() + 1
            data.append(vector)
    ireader.close()
    return colnames, data
Example #3
File: matrixmapper.py  Project: fay/wt
def label_assign(self, docs, labels, lucene_ids):
    term_row = {}
    all = []
    ireader = IndexReader.open(STORE_DIR)
    total_terms = 0
    for i in range(len(lucene_ids)):
        tpv = TermPositionVector.cast_(ireader.getTermFreqVector(lucene_ids[i], 'summary'))
        self.add2matrix(tpv, all, term_row, lucene_ids, i)
        tpv = TermPositionVector.cast_(ireader.getTermFreqVector(lucene_ids[i], 'title'))
        self.add2matrix(tpv, all, term_row, lucene_ids, i)

    # tokenize the labels
    analyzer = CJKAnalyzer()
    labelmatrix = zeros((len(all), len(labels)))
    label_term = []
    for i in range(len(labels)):
        if not labels[i].is_candicate_label and len(labels[i].text) >= 3:
            label_term.append([])
            continue
        stream = analyzer.tokenStream('', StringReader(labels[i].text))
        terms = []
        for token in stream:
            if term_row.has_key(token.term()):
                # weighting: total frequency and document span of the term
                termdocs = ireader.termDocs(Term('summary', token.term()))
                count = 0
                span = 0
                terms.append(token.term())
                while termdocs.next():
                    count += termdocs.freq()
                    span += 1
                weight = labels[i].label_weight
                #if float(span)/ireader.numDocs() >= 0.18 and not re.search('a-zA-z', token.term()):
                #    weight = 0
                labelmatrix[term_row[token.term()]][i] = weight
        label_term.append(terms)
    termmatrix = array(all)
    termmatrix = transpose(termmatrix)

    # rows are docs, columns are labels
    d = dot(termmatrix, labelmatrix)
    result = d / (norm(labelmatrix) * norm(termmatrix))
    doc_label = []
    for i in range(len(result)):
        m = -1
        index = -1
        group = []
        for j in range(len(result[i])):
            if result[i][j] > 0:
                labels[j].id = result[i][j]
                group.append(labels[j])
        # no label scored above zero for this doc
        if not group:
            continue
        # Substring sorts by id, which here holds the score, so this orders
        # the candidate labels by descending score
        group.sort()
        group.reverse()
        max_label = group[0]
        # i is the doc number (its position in docs)
        # if the label itself does not occur in the current doc
        if not max_label.doc_freq.has_key(i):
            count = 0
            overlap = ''
            for k in label_term[index]:
                if term_row.has_key(k) and termmatrix[i][term_row[k]] != 0:
                    overlap = k
                    count += 1
            # exactly one overlapping term, at least two characters long
            if count == 1 and len(overlap) >= 2:
                new_label = pextractor.Substring()
                new_label.text = overlap
                new_label.id = m
                doc_label.append(group[0])
                continue

        doc_label.append(group[0])
    return doc_label
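The normalization above divides the whole dot-product matrix by the product of the two matrix norms, so every entry is scaled by the same constant; true cosine similarity would normalize each doc row and each label column separately. A minimal numpy sketch of that variant (cosine_scores is an assumed helper, not the project's code):

import numpy as np

def cosine_scores(termmatrix, labelmatrix):
    # termmatrix: docs x terms, labelmatrix: terms x labels
    d = np.dot(termmatrix, labelmatrix)
    doc_norms = np.linalg.norm(termmatrix, axis=1)     # one norm per doc row
    label_norms = np.linalg.norm(labelmatrix, axis=0)  # one norm per label column
    # guard against zero vectors so empty docs or labels do not divide by zero
    doc_norms[doc_norms == 0] = 1.0
    label_norms[label_norms == 0] = 1.0
    return d / np.outer(doc_norms, label_norms)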
Example #4
File: matrixmapper.py  Project: fay/wt
def assign(self, docs, labels, lucene_ids):
    term_row = {}
    all = []
    ireader = IndexReader.open(STORE_DIR)
    total_terms = 0
    term_doc_freq = {}
    for i in range(len(lucene_ids)):
        tpv = TermPositionVector.cast_(ireader.getTermFreqVector(lucene_ids[i], 'summary'))
        self.add2matrix(tpv, all, term_row, lucene_ids, i, term_doc_freq)
        # TODO: give terms that occur in the title extra weight
        tpv = TermPositionVector.cast_(ireader.getTermFreqVector(lucene_ids[i], 'title'))
        self.add2matrix(tpv, all, term_row, lucene_ids, i, term_doc_freq)

    # tokenize the labels
    analyzer = CJKAnalyzer()
    labelmatrix = zeros((len(all), len(labels)))
    label_term = []
    # doc -> labels: the labels assigned to each doc
    all_weight_table = {}
    # label -> docs: the docs each label covers
    label_doc = []
    label_doc_map = {}
    for i in range(len(labels)):
        stream = analyzer.tokenStream('', StringReader(labels[i].text))
        terms = []
        c = 0
        # weights of the current label against every doc
        weight_row = {}
        nonzero_index = []
        is_incomplete = False
        for token in stream:
            term = token.term()
            if term_row.has_key(term):
                row = term_row[term]
                terms.append(term)
                docs_with_current_term = all[row]
                for j in range(len(docs_with_current_term)):
                    if docs_with_current_term[j] != 0:
                        if c == 0:
                            nonzero_index.append(j)
                        if c == 0 or j in nonzero_index:
                            weight_row[j] = weight_row.get(j, 0) + docs_with_current_term[j] * term_doc_freq[term] * labels[i].label_weight
                        else:
                            # the doc lacked the label's first term but contains this
                            # one: add 1 so the product cannot be 0, then multiply by
                            # -100 to force the weight to the minimum, i.e. the label
                            # does not fit this doc
                            weight_row[j] = (1 + docs_with_current_term[j] * term_doc_freq[term] * labels[i].label_weight) * (-100)
                    # the doc contained the label's first term but lacks this one
                    elif docs_with_current_term[j] == 0 and j in nonzero_index:
                        # add 1 so the product cannot be 0
                        weight_row[j] = (1 + docs_with_current_term[j] * labels[i].label_weight) * (-100)
                c += 1
            else:
                is_incomplete = True
        label_term.append(terms)
        # bugfix: if not every term of the tokenized label occurs among the
        # documents' terms, discard the label altogether
        if is_incomplete:
            weight_row = {}

        for doc, weight in weight_row.items():
            last = all_weight_table.get(doc)
            if weight > 0:
                if not label_doc_map.has_key(labels[i].text):
                    kc = dao.get_keyword_category_by_category(self.query, labels[i].text)
                    label_doc.append([0, labels[i].text, 0])
                    label_doc_map[labels[i].text] = len(label_doc) - 1
                new_label = pextractor.Substring()
                new_label.text = labels[i].text
                new_label.id = weight
                if last:
                    all_weight_table[doc].append(new_label)
                else:
                    all_weight_table[doc] = [new_label]
                label_doc[label_doc_map[labels[i].text]][2] += 1
                label_doc[label_doc_map[labels[i].text]][0] += weight
    label_doc.sort(reverse=True)
    for k, v in all_weight_table.items():
        v.sort(reverse=True)

    # the dict keys are consecutive integers, which hash into ascending slots,
    # so the values come back already sorted
    thread = SaveLabelsThread(all_weight_table, label_doc, self.entries, self.query)
    thread.start()
    return all_weight_table, label_doc
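The TODO above asks for title terms to be weighted higher than summary terms. add2matrix itself is not shown in these examples, so the following is only a sketch of how such a boost could be folded into the accumulation step; add_term_freqs, TITLE_BOOST, and num_docs are assumptions, not the project's API:

TITLE_BOOST = 2.0  # assumed boost factor

def add_term_freqs(tpv, term_row, matrix, doc_index, num_docs, boost=1.0):
    # accumulate (optionally boosted) term frequencies from a Lucene term
    # vector into a term-by-document matrix, adding rows for unseen terms
    for term, freq in zip(tpv.getTerms(), tpv.getTermFrequencies()):
        if term not in term_row:
            term_row[term] = len(matrix)
            matrix.append([0] * num_docs)
        matrix[term_row[term]][doc_index] += freq * boost

The 'title' term vector would then be passed with boost=TITLE_BOOST and the 'summary' vector with the default boost of 1.0.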
Example #5
File: cnanalyzer.py  Project: fay/wt
if __name__ == '__main__':
    analyzer = CJKAnalyzer()
    directory = RAMDirectory()
    iwriter = IndexWriter(directory, analyzer, True)
    ts = ["javasd。 $##open所大家教唆犯地方地方即可解放大家空间艰苦奋斗矿井口地方", "所看看对抗赛不久交会法觉得拮抗剂"]
    for t in ts:
        doc = Document()
        doc.add(Field("fieldname", t,
                      Field.Store.YES, Field.Index.TOKENIZED,
                      Field.TermVector.WITH_POSITIONS_OFFSETS))
        iwriter.addDocument(doc)
    iwriter.optimize()
    iwriter.close()
    ireader = IndexReader.open(directory)
    tpv = TermPositionVector.cast_(ireader.getTermFreqVector(0, 'fieldname'))
    
    for i, (t, f) in enumerate(zip(tpv.getTerms(), tpv.getTermFrequencies())):
        print 'term %s' % t
        print '  freq: %i' % f
        try:
            print '  pos: ' + str([p for p in tpv.getTermPositions(i)])
        except:
            print '  no pos'
        try:
            print '  off: ' + \
                  str(["%i-%i" % (o.getStartOffset(), o.getEndOffset())
                       for o in tpv.getOffsets(i)])
        except:
            print '  no offsets'
    text = "地方库 fd###  fd 反对 发"
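The snippet is cut off here; presumably text was about to be run through the analyzer as well. A minimal sketch of tokenizing it with the same PyLucene 2.x-style stream API used above, assuming StringReader is imported as in the earlier examples:

stream = analyzer.tokenStream('fieldname', StringReader(text))
for token in stream:
    print 'token: %s' % token.term()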