Пример #1
0
 def set(self, txt, po_id):
     """Record ``po_id`` under every feature key derived from ``txt``.

     Each feature returned by ``feature_md5(txt)`` is used as a key in
     ``self.db``. The stored value is the raw byte dump of an
     ``array('L')`` holding the ids registered for that feature; ``po_id``
     is appended when it is not already present.

     The previous implementation returned from inside the loop as soon as
     it met a feature that already had an entry, so the remaining features
     of ``txt`` were never indexed. All features are processed now.

     Returns:
         The (updated) id array of the first feature that already had an
         entry, or ``None`` when no feature had one. This mirrors what the
         original early ``return`` handed back, so existing callers see
         the same value.
     """
     # NOTE(review): 'L' is a C unsigned long, whose byte size is
     # platform dependent -- confirm the stored data is never shared
     # between platforms with different sizes.
     first_existing = None
     for key in feature_md5(txt):
         entry = self.get(key)
         if entry:
             val = array('L')
             val.fromstring(entry)
             if first_existing is None:
                 # Same object as ``val``: a later append is visible here.
                 first_existing = val
             if po_id in val:
                 # Already registered for this feature; nothing to write.
                 continue
             val.append(po_id)
         else:
             val = array('L', [po_id])

         # A falsy result from db.set is treated as a failed write; it is
         # reported on stderr and the remaining features are still tried.
         if not self.db.set(key, val.tostring()):
             sys.stderr.write(
                 'open error: ' + str(self.db.error()) + '\n')
     return first_existing
Пример #2
0
 def set(self, txt, po_id):
     """Record ``po_id`` under every feature key derived from ``txt``.

     Each feature returned by ``feature_md5(txt)`` is used as a key in
     ``self.db``. The stored value is the raw byte dump of an
     ``array('L')`` holding the ids registered for that feature; ``po_id``
     is appended when it is not already present.

     The previous implementation returned from inside the loop as soon as
     it met a feature that already had an entry, so the remaining features
     of ``txt`` were never indexed. All features are processed now.

     Returns:
         The (updated) id array of the first feature that already had an
         entry, or ``None`` when no feature had one. This mirrors what the
         original early ``return`` handed back, so existing callers see
         the same value.
     """
     # NOTE(review): 'L' is a C unsigned long, whose byte size is
     # platform dependent -- confirm the stored data is never shared
     # between platforms with different sizes.
     first_existing = None
     for key in feature_md5(txt):
         entry = self.get(key)
         if entry:
             val = array('L')
             val.fromstring(entry)
             if first_existing is None:
                 # Same object as ``val``: a later append is visible here.
                 first_existing = val
             if po_id in val:
                 # Already registered for this feature; nothing to write.
                 continue
             val.append(po_id)
         else:
             val = array('L', [po_id])

         # A falsy result from db.set is treated as a failed write; it is
         # reported on stderr and the remaining features are still tried.
         if not self.db.set(key, val.tostring()):
             sys.stderr.write(
                 'open error: ' + str(self.db.error()) + '\n')
     return first_existing
Пример #3
0
    def txt_is_duplicate(self, txt):
        """Return candidate duplicates of ``txt`` with their match ratio.

        The text is reduced to features via ``feature_md5``; an empty
        feature list yields ``[]``. ``self.__find_duplicate__`` is then
        asked for a mapping of id -> count (presumably the number of
        features shared with ``txt`` -- confirm against that helper).

        Returns:
            A list of ``(id, ratio)`` tuples, where ``ratio`` is the count
            divided by the number of features of ``txt``. Only ids whose
            count reaches the threshold are included.
        """
        features = feature_md5(txt)
        if not features:
            return []

        # Kept as a float so the ratio below uses true division.
        total = float(len(features))

        # Required count: just above 61.8% of the features, capped at the
        # total so that very short feature lists can still match.
        threshold = min(int(total * 0.618) + 1, total)

        counts = self.__find_duplicate__(features)
        return [
            (doc_id, hits / total)
            for doc_id, hits in counts.iteritems()
            if hits >= threshold
        ]
Пример #4
0
    def txt_is_duplicate(self, txt):
        """Find stored ids whose features overlap enough with ``txt``.

        ``feature_md5(txt)`` supplies the features; when it returns
        nothing the result is an empty list. The per-id counts come from
        ``self.__find_duplicate__`` (presumably the number of matching
        features per id -- verify against that helper).

        Returns:
            A list of ``(id, ratio)`` tuples with ``ratio`` equal to the
            count divided by the number of features of ``txt``, limited
            to ids whose count reaches the threshold.
        """
        features = feature_md5(txt)
        if not features:
            return []

        # Float on purpose: the division below must not truncate.
        n_features = float(len(features))

        # Threshold is one more than 61.8% of the feature count (rounded
        # down), but never more than the feature count itself.
        required = min(int(n_features * 0.618) + 1, n_features)

        matches = []
        per_id_counts = self.__find_duplicate__(features)
        for candidate, shared in per_id_counts.iteritems():
            if shared < required:
                continue
            matches.append((candidate, shared / n_features))
        return matches