Пример #1
0
  def _tag_rate(self):
    """Write log10(total / value) for every cached training tag to a file.

    Reads the total stored under ``TRAIN_TOTAL`` and every cache entry whose
    key starts with ``TRAIN_TAG_PREFIX``. For each entry one line is written
    to ``tag_rate_train.dic`` inside ``DATA_DIR``: the key with the prefix
    stripped, a TAB, and ``log10(total / value)`` formatted with three
    decimals.

    Raises:
      ZeroDivisionError: if a cached tag value is zero.
    """
    import math

    total = int(cache.get(TRAIN_TOTAL))
    file_path = os.path.join(DATA_DIR, 'tag_rate_train.dic')
    prefix_len = len(TRAIN_TAG_PREFIX)

    # The context manager closes the file even when a cache lookup or the
    # arithmetic below raises part-way through the loop.
    with open(file_path, 'w') as out:
      for key in cache.keys('%s*' % TRAIN_TAG_PREFIX):
        name = key[prefix_len:]
        value = float(cache.get(key))
        score = math.log10(total / value)

        # NOTE(review): assumes cache keys are UTF-8 byte strings, so the
        # line is built as unicode and encoded once on write -- confirm.
        line = '%s\t%.3f\n' % (name.decode('utf-8'), score)
        out.write(line.encode('utf-8'))
Пример #2
0
  def _tag_relations(self, prefix, func=None):
    """Write normalised tag weights for every sorted set under ``prefix``.

    For each cache key starting with ``prefix`` the sorted-set members are
    read together with their scores. Each member's score is divided by
    ``func(count)``, where ``count`` is the value cached for that tag under
    ``TRAIN_TAG_PREFIX``, and the results are then divided by their sum so
    the weights of one key add up to one.

    One line per key is written to ``<prefix>_<func name>.dic`` in
    ``DATA_DIR`` (``:`` in the prefix replaced by ``_`` and lower-cased):
    the key with the prefix stripped, a TAB, then comma-separated
    ``tag__weight`` pairs (four decimals) in descending weight order.

    Args:
      prefix: cache key prefix selecting the sorted sets to export.
      func: optional callable applied to each tag's cached value before it
        is used as the divisor. Falls back to the identity when falsy. Its
        ``__name__`` becomes part of the output file name.
    """
    # Keep the name ``default``: it ends up in the output file name through
    # ``func.__name__``.
    def default(a):
      return a

    if not func:
      func = default

    keys = cache.keys('%s*' % prefix)
    file_path = os.path.join(DATA_DIR, '%s_%s.dic'
                % (prefix.replace(':', '_').lower(), func.__name__))

    def rank(pair):
      # Weights are compared after truncation to three decimals, so weights
      # that truncate to the same value count as ties and keep the order in
      # which the cache returned them (the sort is stable).
      return int(pair[1] * 1000)

    # The context manager closes the file even if a cache call or a
    # division raises while a key is being processed.
    with open(file_path, 'w') as out:
      for key in keys:
        items = cache.zrevrangebyscore(name=key, min='-inf', max='+inf', withscores=True)
        name = key[len(prefix):]

        # Look up each tag's count once and reuse it for both the total and
        # the per-tag share instead of querying the cache twice.
        weighted = []
        for tag, value in items:
          score = cache.get(name=cache_key(TRAIN_TAG_PREFIX, tag.decode('utf-8')))
          weighted.append((tag, value / float(func(score))))

        total = sum(weight for _, weight in weighted)
        objs = [(tag, weight / total) for tag, weight in weighted]
        objs.sort(key=rank, reverse=True)

        values = ','.join(['%s__%.4f' % (tag.decode('utf-8'), share)
                           for tag, share in objs])

        line = '%s\t%s\n' % (name.decode('utf-8'), values)
        out.write(line.encode('utf-8'))