def _tag_rate(self):
    """Write a log10(total / count) weight for every training tag to disk.

    Reads the integer stored in the cache under ``TRAIN_TOTAL`` and every
    cache key that starts with ``TRAIN_TAG_PREFIX``.  For each such key one
    line ``<tag> TAB <weight>`` is written to
    ``<DATA_DIR>/tag_rate_train.dic``, where ``<tag>`` is the key with the
    prefix stripped and ``<weight>`` is ``log10(total / value)`` printed
    with three decimals.  The output file is UTF-8 encoded and is
    truncated on every call.

    NOTE(review): this looks like an IDF-style weight; confirm against the
    code that consumes the .dic file.

    Raises:
        ZeroDivisionError: if a tag counter in the cache is 0.
        TypeError / ValueError: if a cached value is missing or is not
            numeric (propagated from ``int()`` / ``float()``).
    """
    import math

    total = int(cache.get(TRAIN_TOTAL))
    file_path = os.path.join(DATA_DIR, 'tag_rate_train.dic')
    keys = cache.keys('%s*' % TRAIN_TAG_PREFIX)
    prefix_len = len(TRAIN_TAG_PREFIX)

    # The context manager guarantees the handle is closed even when a cache
    # read or the log computation raises part-way through the loop.
    with open(file_path, 'w') as out:
        for key in keys:
            name = key[prefix_len:]
            # float() keeps the division below a true division even though
            # ``total`` is an int.
            value = float(cache.get(key))
            weight = math.log10(total / value)
            line = '%s\t%.3f\n' % (name.decode('utf-8'), weight)
            out.write(line.encode('utf-8'))
def _tag_relations(self, prefix, func=None):
    """Dump normalised tag-relation weights for every key under ``prefix``.

    For each cache key starting with ``prefix`` the members of the sorted
    set stored there are read together with their scores.  Every member's
    score is divided by ``func(<cached value of that tag>)``, the results
    are normalised so they sum to 1, sorted in descending order and
    written as one line::

        <name> TAB <tag>__<share>,<tag>__<share>,...

    where ``<name>`` is the key with ``prefix`` stripped and ``<share>`` is
    printed with four decimals.  The output goes to
    ``<DATA_DIR>/<prefix>_<func name>.dic`` (``:`` in the prefix replaced
    by ``_``, lower-cased), UTF-8 encoded, truncated on every call.

    Args:
        prefix: cache key prefix selecting the sorted sets to export.
        func: optional callable applied to each tag's cached value before
            it is used as a divisor.  Its ``__name__`` becomes part of the
            output file name.  When falsy, the identity function named
            ``default`` is used.

    Raises:
        ZeroDivisionError: if ``func`` yields 0 for a tag, or if the
            weighted values of a non-empty set sum to 0.
        TypeError / ValueError: if a tag has no numeric cached value
            (propagated from ``float()``).

    NOTE(review): ``cache`` is presumably a Redis-style client (``keys``,
    ``get``, ``zrevrangebyscore``); confirm against the module setup.
    """
    # Keep this name: ``func.__name__`` is embedded in the output file
    # name, so renaming the fallback would rename the generated file.
    def default(a):
        return a

    if not func:
        func = default

    keys = cache.keys('%s*' % prefix)
    file_name = '%s_%s.dic' % (prefix.replace(':', '_').lower(), func.__name__)
    file_path = os.path.join(DATA_DIR, file_name)

    # The context manager closes the file even if a cache lookup or a
    # division fails while the lines are being produced.
    with open(file_path, 'w') as out:
        for key in keys:
            items = cache.zrevrangebyscore(
                name=key, min='-inf', max='+inf', withscores=True)
            name = key[len(prefix):]

            # Fetch each tag's cached value once and reuse the weighted
            # score for both the running total and the final share.
            weighted = []
            total = 0
            for tag, value in items:
                score = cache.get(
                    name=cache_key(TRAIN_TAG_PREFIX, tag.decode('utf-8')))
                weight = value / float(func(score))
                weighted.append((tag, weight))
                total += weight

            objs = [(tag, weight / total) for tag, weight in weighted]

            # Order by the share truncated to three decimals, largest
            # first; shares equal at that precision keep the order in
            # which the cache returned them (the sort is stable).
            objs = sorted(objs, key=lambda obj: int(obj[1] * 1000),
                          reverse=True)

            values = ','.join(
                ['%s__%.4f' % (obj[0].decode('utf-8'), obj[1])
                 for obj in objs])
            line = '%s\t%s\n' % (name.decode('utf-8'), values)
            out.write(line.encode('utf-8'))