예제 #1
0
def removeNonExistChannel():
    """Remove channels that look non-existent or duplicated, with their keys.

    A channel is selected for removal when either:

    * its ``<channel>/0`` entry in ``maintext`` has a falsy value, or
    * another channel has the same name ignoring letter case and ranks
      higher by ``(number of keys, channel name)``; only the top-ranked
      spelling of each case-insensitive group is kept.

    Every ``maintext`` key belonging to a selected channel is passed to
    ``dbase.removeKey`` and the channel is popped from
    ``channels._db.items``. Two progress lines are printed.
    """
    channel_to_remove = set()
    for key, title in maintext.items():
        if key.endswith('/0') and not title:
            channel_to_remove.add(key.split('/')[0])

    # Group all keys by lower-cased channel name to detect case variants.
    items = [(key, key.split('/')[0].lower()) for key, _ in maintext.items()]
    bucket = createBucket(items)
    for _, keys in bucket.items():
        # Within one case-insensitive group, split by exact spelling.
        sub_bucket = createBucket([(key, key.split('/')[0]) for key in keys])
        variants = sorted(
            ((len(members), channel)
             for channel, members in sub_bucket.items()),
            reverse=True)
        # variants[0] is the spelling to keep; all others are removed.
        channel_to_remove.update(channel for _, channel in variants[1:])
    print('remove channel count', len(channel_to_remove))

    # Collect the keys first and delete afterwards: if dbase.removeKey drops
    # entries from ``maintext`` (not visible here -- confirm), deleting while
    # iterating a live dict view would raise RuntimeError.
    doomed = [key for key, _ in maintext.items()
              if key.split('/')[0] in channel_to_remove]
    for key in doomed:
        dbase.removeKey(key)
    print('remove non exist', len(doomed))

    for channel in channel_to_remove:
        channels._db.items.pop(channel, None)
예제 #2
0
def cleanupNoMain():
    """Remove every ``index`` key that has no truthy ``maintext`` value.

    Prints the label ``cleanupNoMain`` followed by the number of keys that
    were passed to ``dbase.removeKey``.
    """
    # Collect the stale keys before deleting anything: if dbase.removeKey
    # drops entries from ``index`` (not visible here -- confirm), deleting
    # while iterating a live dict view would raise RuntimeError.
    stale = [key for key, _ in index.items() if not maintext.get(key)]
    for key in stale:
        dbase.removeKey(key)
    print('cleanupNoMain', len(stale))
예제 #3
0
def cleanKeys(keys, limit):
    """Remove the keys ranked from position ``limit`` onward.

    Keys are ordered ascending by ``(getScore(key), key)``. Everything in
    that ordering from index ``limit`` on is passed to ``dbase.removeKey``.

    Returns the number of keys removed.
    """
    ranked = sorted((getScore(key), key) for key in keys)
    surplus = ranked[limit:]
    for _, key in surplus:
        dbase.removeKey(key)
    return len(surplus)
예제 #4
0
def cleanupOldOrBad(keys):
    """Remove flagged keys, then trim what is left to a retain length.

    ``keys`` is sorted in place (the caller's list is reordered). The first
    and last entries of the sorted list are never examined. Of the
    remaining keys, each one for which ``shouldRemove`` is truthy is
    removed. If 300 or more keys survive that pass, all but the trailing
    ``getRetainLen(<channel>)`` of them are removed as well, where
    ``<channel>`` is the part of the first survivor before ``/``.

    Returns the total number of keys passed to ``dbase.removeKey``.
    """
    # NOTE(review): this is a plain string sort, so 'c/10' orders before
    # 'c/9' -- confirm that is the intended ordering.
    keys.sort()
    candidates = keys[1:-1]  # skip both ends; the last one is always left

    removed = 0
    survivors = []
    for key in candidates:
        if not shouldRemove(key):
            survivors.append(key)
            continue
        dbase.removeKey(key)
        removed += 1

    if len(survivors) >= 300:
        retain_len = getRetainLen(survivors[0].split('/')[0])
        # NOTE(review): a retain length of 0 makes this slice empty, so
        # nothing is trimmed in that case.
        overflow = survivors[:-retain_len]
        for key in overflow:
            dbase.removeKey(key)
        removed += len(overflow)
    return removed
예제 #5
0
def cleanupChannel(keys, keepChinese=True):
    """Trim a key list down to its 50 highest-scoring entries.

    Nothing is removed when ``keys`` is falsy or holds 100 entries or
    fewer. With ``keepChinese`` set, keys for which
    ``isSimplified(index.get(key))`` is truthy are exempt from removal and
    do not count toward the 50. The remaining keys are ranked descending
    by ``(getKeyScore(key), key)`` and everything past the first 50 is
    passed to ``dbase.removeKey``.

    Returns the number of keys removed.
    """
    if not keys:
        return 0
    if len(keys) <= 100:
        return 0
    if keepChinese:
        keys = [key for key in keys if not isSimplified(index.get(key))]
    if len(keys) <= 50:
        return 0
    ranked = sorted(((getKeyScore(key), key) for key in keys), reverse=True)
    excess = ranked[50:]
    for _, key in excess:
        dbase.removeKey(key)
    return len(excess)
예제 #6
0
def slowBackfill(channel):
    """Walk backwards through a channel's post ids, storing unseen posts.

    Starts from ``_findLastMessage(channel)`` and runs for at most
    ``getMaxIteration(channel)`` steps, decrementing the post id each step
    and stopping once it reaches 1 or below. When ``index`` already holds
    a truthy value for ``<channel>/<id>``, the id is moved back by a
    further random 0-99 and no fetch is made. Otherwise the post is
    fetched through ``webgram.getPost``:

    * if ``post.getIndex()`` is truthy, the post goes to ``dbase.update``;
    * otherwise, once at least one post has been stored in this run, the
      key is passed to ``dbase.removeKey``.

    The walk also stops as soon as ``postTooOld(post)`` is truthy.
    """
    cursor = _findLastMessage(channel)
    stored_any = False
    for _ in range(getMaxIteration(channel)):
        cursor -= 1
        if cursor <= 1:
            break
        key = channel + '/' + str(cursor)
        if index.get(key):
            # Known key: skip back a random extra distance and try again.
            cursor -= int(random.random() * 100)
        else:
            post = webgram.getPost(channel, cursor)
            if post.getIndex():
                stored_any = True
                dbase.update(post)
            elif stored_any:
                dbase.removeKey(key)
            if postTooOld(post):
                break
예제 #7
0
def cleanupRedundant():
    """Remove keys whose ``maintext`` value shares its first 10 items.

    Keys ending in ``/0`` are ignored. The other keys are grouped by the
    first 10 items of their ``maintext`` value. Inside each group the keys
    are ordered ascending by ``(getScore(key), key)`` and all but the
    first are removed. The first key is removed as well when its score
    equals 1; that removal is not included in the printed count.
    """
    groups = {}
    for key, text in maintext.items():
        if not key.endswith('/0'):
            groups.setdefault(text[:10], []).append(key)
    print('cleanup1 1', len(groups))

    removed = 0
    for members in groups.values():
        ranked = sorted((getScore(key), key) for key in members)
        best_score, best_key = ranked[0]
        for _, key in ranked[1:]:
            dbase.removeKey(key)
            removed += 1
        if best_score == 1:
            dbase.removeKey(best_key)
    print('cleanupRedundant', removed)