def set(self, txt, po_id):
    """Record ``po_id`` under every feature key derived from ``txt``.

    For each key produced by ``feature_md5(txt)`` the stored entry is read
    with ``self.get``, decoded as an ``array('L')`` of ids, extended with
    ``po_id`` when it is not already present, and written back through
    ``self.db.set``.

    A write that ``self.db.set`` reports as failed (falsy return) is logged
    to stderr and processing continues with the next key; no exception is
    raised.

    Returns the id array of the last feature processed, or ``None`` when
    ``txt`` yields no features.
    """
    # Pre-bind so an empty feature list returns None instead of raising
    # UnboundLocalError at the final return.
    val = None
    for key in feature_md5(txt):
        entry = self.get(key)
        if entry:
            # NOTE(review): 'L' is a C unsigned long, whose width depends on
            # the platform, so stored entries are presumably only readable on
            # a platform with the same width -- confirm.
            val = array('L')
            val.fromstring(entry)
            if po_id in val:
                # Already indexed under this key; nothing to write.
                continue
            val.append(po_id)
        else:
            val = array('L', [po_id])
        # Single write path so both the insert and the update case report
        # failures the same way.
        if not self.db.set(key, val.tostring()):
            sys.stderr.write('open error: ' + str(self.db.error()) + '\n')
    return val
def txt_is_duplicate(self, txt):
    """Return the stored ids whose features overlap enough with ``txt``.

    The features of ``txt`` come from ``feature_md5``; the number of features
    each stored id has in common with them comes from
    ``self.__find_duplicate__``. An id is reported when its shared count
    reaches ``int(len(features) * 0.618) + 1``, capped at the total number of
    features.

    Returns a list of ``(id, shared_count / feature_count)`` tuples, or an
    empty list when ``txt`` produces no features.
    """
    features = feature_md5(txt)
    if not features:
        return []

    # Kept as a float so the ratio below is a true division.
    total = float(len(features))
    threshold = min(int(total * 0.618) + 1, total)

    shared_counts = self.__find_duplicate__(features)
    return [
        (doc_id, hits / total)
        for doc_id, hits in shared_counts.iteritems()
        if hits >= threshold
    ]
def txt_is_duplicate(self, txt):
    """Find stored ids that share a large fraction of ``txt``'s features.

    ``feature_md5(txt)`` supplies the features and
    ``self.__find_duplicate__`` supplies, per stored id, how many of those
    features it shares. Ids whose shared count is at least
    ``min(int(n * 0.618) + 1, n)`` (``n`` being the feature count) are kept.

    Returns a list of ``(id, ratio)`` tuples where ``ratio`` is the shared
    count divided by ``n``; the list is empty when there are no features.
    """
    signature = feature_md5(txt)
    if signature:
        # Float on purpose: the ratio appended below must not truncate.
        size = float(len(signature))
        required = min(int(size * 0.618) + 1, size)

        duplicates = []
        per_id_hits = self.__find_duplicate__(signature)
        for candidate, shared in per_id_hits.iteritems():
            if shared >= required:
                ratio = shared / size
                duplicates.append((candidate, ratio))
        return duplicates
    return []