Example #1
File: matrixmapper.py  Project: fay/wt
 def assign(self, docs, labels, lucene_ids):
     term_row = {}
     all = []
     ireader = IndexReader.open(STORE_DIR)
     total_terms = 0
     term_doc_freq = {}
     for i in range(len(lucene_ids)):
         tpv = TermPositionVector.cast_(ireader.getTermFreqVector(lucene_ids[i], 'summary'))
         self.add2matrix(tpv, all, term_row, lucene_ids, i,term_doc_freq)
         """
             TODO:给属于标题的term加权
         """
         tpv = TermPositionVector.cast_(ireader.getTermFreqVector(lucene_ids[i], 'title'))
         self.add2matrix(tpv, all, term_row, lucene_ids, i,term_doc_freq)
     # Tokenize each label
     analyzer = CJKAnalyzer()
     labelmatrix = zeros((len(all), len(labels)))
     label_term = []
     # doc -> label: the labels assigned to each doc
     all_weight_table = {}
     # label -> doc: the docs assigned to each label
     label_doc = []
     label_doc_map = {}
     for i in range(len(labels)):
         nonzero_table = []
         # weight products between this label and all docs
         weight_table = []
         
         stream = analyzer.tokenStream('', StringReader(labels[i].text))
         terms = []            
         c = 0
         weight_row = {}
         nonzero_index = []  
         is_incomplete = False
         for token in stream:
             term = token.term()
             if term in term_row:
                 row = term_row[term]
                 terms.append(term)
                 docs_with_current_term = all[row]
                 for j in range(len(docs_with_current_term)):
                     if docs_with_current_term[j] != 0:                                            
                         if c == 0:
                             nonzero_index.append(j)
                         if c == 0 or j in nonzero_index:
                             weight_row[j] = weight_row.get(j, 0) + docs_with_current_term[j] * term_doc_freq[term] * labels[i].label_weight 
                         else:
                             # add 1 so the weight product cannot be 0
                             # for a doc that was in nonzero_index at first but drops out
                             # later: multiply by -100 to minimize the product, marking
                             # this label as unsuitable for that doc
                             weight_row[j] = (1 + docs_with_current_term[j] * term_doc_freq[term] * labels[i].label_weight) * (- 100)
                     # a doc that was absent from nonzero_index at first but appears later
                     elif docs_with_current_term[j] == 0 and j in nonzero_index:
                         # add 1 so the weight product cannot be 0
                         weight_row[j] = (1 + docs_with_current_term[j] * labels[i].label_weight) * (- 100)
                 c += 1
             else:
                 is_incomplete = True
         label_term.append(terms)
         # bugfix: if, after tokenization, not every term of this label occurs
         # among the docs' terms, discard the label entirely.
         if is_incomplete:
             weight_row = {}
                 
                 
         for doc, weight in weight_row.items():
             last = all_weight_table.get(doc)                
             if weight > 0:
                 if labels[i].text not in label_doc_map:
                     kc = dao.get_keyword_category_by_category(self.query, labels[i].text)
                     label_doc.append([0, labels[i].text, 0])
                     label_doc_map[labels[i].text] = len(label_doc) - 1
                 new_label = pextractor.Substring()
                 new_label.text = labels[i].text
                 new_label.id = weight
                 if last:
                     all_weight_table[doc].append(new_label)
                 else:
                     all_weight_table[doc] = [new_label]
                 #label_doc[label_doc_map[labels[i].text]][2].append(doc)
                 label_doc[label_doc_map[labels[i].text]][2] += 1
                 label_doc[label_doc_map[labels[i].text]][0] += weight
                 
             # (an earlier, commented-out variant kept only the single
             # highest-weight label per doc instead of a list)
     label_doc.sort(reverse=True)
     for k, v in all_weight_table.items():
         v.sort(reverse=True)
             
     # Because the keys of the map are consecutive integers, hashing stores them
     # in ascending order, so the values come back already sorted.
     thread = SaveLabelsThread(all_weight_table,label_doc,self.entries,self.query)
     thread.start()
     return all_weight_table,label_doc
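
The add2matrix helper that builds the term-document matrix is not included in this listing. Below is a speculative sketch inferred only from its call sites (here it receives term_doc_freq; in label_assign further down it does not, hence the default argument). The body is an assumption, not the project's actual code:

 def add2matrix(self, tpv, all, term_row, lucene_ids, i, term_doc_freq=None):
     # Hypothetical reconstruction: fold one document's term-frequency vector
     # into the growing term-document matrix `all` (term_row: term -> row index).
     if tpv is None:
         # the field may lack a stored term vector for this document
         return
     for term, freq in zip(tpv.getTerms(), tpv.getTermFrequencies()):
         if term not in term_row:
             # first occurrence of the term: open a new row, one column per doc
             term_row[term] = len(all)
             all.append([0] * len(lucene_ids))
         all[term_row[term]][i] += freq
         if term_doc_freq is not None:
             # count how many doc fields contain this term
             term_doc_freq[term] = term_doc_freq.get(term, 0) + 1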
Example #2
File: matrixmapper.py  Project: fay/wt
 def label_assign(self, docs, labels, lucene_ids):
     term_row = {}
     all = []
     ireader = IndexReader.open(STORE_DIR)
     total_terms = 0
     for i in range(len(lucene_ids)):
         tpv = TermPositionVector.cast_(ireader.getTermFreqVector(lucene_ids[i], 'summary'))
         self.add2matrix(tpv, all, term_row, lucene_ids, i)
         tpv = TermPositionVector.cast_(ireader.getTermFreqVector(lucene_ids[i], 'title'))
         self.add2matrix(tpv, all, term_row, lucene_ids, i)
     
     # Tokenize each label
     analyzer = CJKAnalyzer()
     labelmatrix = zeros((len(all), len(labels)))
     label_term = []
     for i in range(len(labels)):
         if not labels[i].is_candicate_label and len(labels[i].text) >= 3:
             label_term.append([])
             continue
         stream = analyzer.tokenStream('', StringReader(labels[i].text))
         terms = []
         for token in stream:
             if token.term() in term_row:
                 # weighting
                 termdocs = ireader.termDocs(Term('summary', token.term()))
                 count = 0
                 span = 0
                 terms.append(token.term())
                 while termdocs.next():
                     count += termdocs.freq()
                     span += 1
                 weight = labels[i].label_weight
                 labelmatrix[term_row[token.term()]][i] = weight
         label_term.append(terms)
     termmatrix = array(all)
     termmatrix = transpose(termmatrix)
     # rows are docs, cols are labels
     d = dot(termmatrix, labelmatrix)
     result = d / (norm(labelmatrix) * norm(termmatrix))
     doc_label = []
     for i in range(len(result)):
         m = - 1
         index = - 1
         group = []
         for j in range(len(result[i])):
             if result[i][j] > 0:
                 labels[j].id = result[i][j]
                 group.append(labels[j])
                 # track the best score and its position in `labels`
                 if result[i][j] > m:
                     m = result[i][j]
                     index = j
         if not group:
             # no label scored above zero for this doc
             continue
         # Substring sorts by id, which is exactly what we need here
         group.sort()
         group.reverse()
         max_label = group[0]
         # i: doc number (its position within the docs)
         # if the label itself never occurs in the current doc
         if i not in max_label.doc_freq:
             count = 0
             overlap = ''
             for k in label_term[index]:
                 if k in term_row and termmatrix[i][term_row[k]] != 0:
                     overlap = k
                     count += 1
             # at least one overlapping term, and its length is >= 2
             if count == 1 and len(overlap) >= 2:
                 new_label = pextractor.Substring()
                 new_label.text = overlap
                 new_label.id = m
                 doc_label.append(new_label)
                 continue
                     
         doc_label.append(group[0])
     return doc_label
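
To see the scoring step of label_assign in isolation, here is a self-contained toy run with made-up matrices. Note that, as in the code above, normalization uses one global norm per matrix rather than per-row/column norms, so scores are only comparable within a single run:

import numpy as np

# rows are docs, cols are terms (toy data)
termmatrix = np.array([[1., 0., 2.],
                       [0., 3., 1.]])
# rows are terms, cols are labels
labelmatrix = np.array([[1., 0.],
                        [0., 1.],
                        [1., 1.]])
d = np.dot(termmatrix, labelmatrix)
# the same crude normalization as label_assign above
result = d / (np.linalg.norm(labelmatrix) * np.linalg.norm(termmatrix))
print(result)  # result[i][j] scores doc i against label j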
Example #3
File: cnanalyzer.py  Project: fay/wt
    
    for i, (t, f) in enumerate(zip(tpv.getTerms(), tpv.getTermFrequencies())):
        print 'term %s' % t
        print '  freq: %i' % f
        try:
            print '  pos: ' + str([p for p in tpv.getTermPositions(i)])
        except:
            print '  no pos'
        try:
            print '  off: ' + \
                  str(["%i-%i" % (o.getStartOffset(), o.getEndOffset())
                       for o in tpv.getOffsets(i)])
        except:
            print '  no offsets'
    text = "地方库 fd###  fd 反对 发"
    stream = analyzer.tokenStream("fieldname", StringReader(text))
    for s in stream:
        print s
    print dir(analyzer)
    
    
def b():
    from dot.searcher import Searcher, STORE_DIR
    from apps.wantown import dao
    from apps.wantown.models import Entry
    queries = ['sms','algorithm','java','google','mac','apple','淘宝','阿里巴巴','云计算','python','java google']
    searcher = Searcher()
    import datetime
    for query in queries[:5]:
        hits = searcher.search(query)
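
The listing is truncated at this point. As a closing aside, the analyzer/tokenStream pattern that recurs in all three files can be factored into a small helper. The sketch below assumes the same PyLucene-era API used above (StringReader and the analyzers come from the lucene imports elsewhere in the project; the helper name is ours):

def tokenize(analyzer, text):
    # collect the terms an analyzer emits for a piece of text; this is the
    # same tokenStream pattern used in assign, label_assign and the demo above
    stream = analyzer.tokenStream('', StringReader(text))
    return [token.term() for token in stream]

# For a CJKAnalyzer, CJK input comes back as overlapping bigrams plus any
# latin tokens, which is what the label/term matching above relies on.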