def main(): t = {} for k, v in NAME2ID.iteritems(): t[name_tidy(k)] = v print "#coding: utf-8" print "NAME2ID = ", pprint(t)
def main(): t = {} for k,v in NAME2ID.iteritems(): t[name_tidy(k)] = v print "#coding: utf-8" print "NAME2ID = ", pprint(t)
def tag_to_id(tag):
    """Resolve ``tag`` to an id, returning 0 when nothing matches.

    Resolution order:
      1. ``str(tag)`` is looked up in ``id2topic``; the hit is translated
         through ``ID2MY`` and accepted only if the translated value is a
         member of ``myidset``.
      2. Otherwise the tag is normalised with ``name_tidy`` and looked up
         in ``NAME2ID``.
    """
    tag = str(tag)
    topic_key = id2topic.get(tag, 0)

    if topic_key in ID2MY:
        translated = ID2MY[topic_key]
        # Only a truthy, locally known id short-circuits the name lookup.
        if translated in myidset and translated:
            return translated

    # Fall back to matching by tidied name.
    return NAME2ID.get(name_tidy(tag), 0)
def merge():
    """Turn the per-key hashes in redis into ranked topic lists in word_tf.txt.

    Pass 1 sums, per hash field (converted to int and used as a topic id),
    the values found across every key returned by ``redis.keys("*")``.
    Pass 2 re-reads each key, scales its per-topic values by the topic
    totals, and writes one ``dumps([word, [(topic, rank), ...]])`` line per
    key that ends up with at least one non-zero rank.

    Takes no arguments and returns None; progress is printed to stdout and
    the result file is overwritten.
    """
    topic_count = defaultdict(int)
    # NOTE(review): this value is never read; `f` is rebound as a loop
    # variable in the ranking loop below.
    f = "word2count.txt"
    keys = redis.keys("*")

    # Pass 1: accumulate the grand total for every topic.
    for pos, key in enumerate(keys):
        l = redis.hgetall(key)
        print "1",pos, key
        for k,v in l.iteritems():
            topic_count[int(k)]+=int(v)

    #word_topic_freq = defaultdict(list)

    # Pass 2: per-key scaling and ranking.
    with open("word_tf.txt", "w") as word_freq:
        for pos, word in enumerate(keys):
            tf = []
            l = redis.hgetall(word)
            for topic, freq in l.iteritems():
                topic = int(topic)
                count = topic_count[topic]
                # Topics whose total is below 10000 are ignored.
                if count < 10000:
                    continue
                # Scale to "per 500000 of the topic total". With int operands
                # this is floor division under Python 2 (assuming no
                # `from __future__ import division` elsewhere in the file).
                freq = int(freq)*500000/count
                if freq > 0:
                    tf.append((topic, freq))

            fcount = sum(i[1] for i in tf)
            tf = dict(tf)

            # If the key itself tidies to a known name, give that id a weight
            # equal to the whole previous total, and grow the total by the
            # difference so fcount remains the sum of the tf values.
            id = NAME2ID.get(name_tidy(word), 0)
            if id:
                t = tf.get(id,0)
                diff = fcount - t
                tf[id] = fcount
                fcount += diff

            # Nothing survived the filters: avoid dividing by zero below.
            if not fcount:
                continue

            # Express each weight as a share of the total in 1/10000ths and
            # drop shares that truncate to zero.
            t = []
            for topic, f in tf.iteritems():
                rank = int(f*10000/fcount)
                if rank:
                    t.append((topic, rank))

            if t:
                # `dumps` is whatever the file imports under that name
                # (presumably json.dumps -- confirm).
                word_freq.write( dumps([word, t])+"\n" )
def merge():
    """Turn the per-key hashes in redis into ranked topic lists in word_tf.txt.

    Pass 1 sums, per hash field (converted to int and used as a topic id),
    the values found across every key returned by ``redis.keys("*")``.
    Pass 2 re-reads each key, scales its per-topic values by the topic
    totals, and writes one ``dumps([word, [(topic, rank), ...]])`` line per
    key that ends up with at least one non-zero rank.

    Takes no arguments and returns None; progress is printed to stdout and
    the result file is overwritten.
    """
    topic_count = defaultdict(int)
    # NOTE(review): this value is never read; `f` is rebound as a loop
    # variable in the ranking loop below.
    f = "word2count.txt"
    keys = redis.keys("*")

    # Pass 1: accumulate the grand total for every topic.
    for pos, key in enumerate(keys):
        l = redis.hgetall(key)
        print "1", pos, key
        for k, v in l.iteritems():
            topic_count[int(k)] += int(v)

    #word_topic_freq = defaultdict(list)

    # Pass 2: per-key scaling and ranking.
    with open("word_tf.txt", "w") as word_freq:
        for pos, word in enumerate(keys):
            tf = []
            l = redis.hgetall(word)
            for topic, freq in l.iteritems():
                topic = int(topic)
                count = topic_count[topic]
                # Topics whose total is below 10000 are ignored.
                if count < 10000:
                    continue
                # Scale to "per 500000 of the topic total". With int operands
                # this is floor division under Python 2 (assuming no
                # `from __future__ import division` elsewhere in the file).
                freq = int(freq) * 500000 / count
                if freq > 0:
                    tf.append((topic, freq))

            fcount = sum(i[1] for i in tf)
            tf = dict(tf)

            # If the key itself tidies to a known name, give that id a weight
            # equal to the whole previous total, and grow the total by the
            # difference so fcount remains the sum of the tf values.
            id = NAME2ID.get(name_tidy(word), 0)
            if id:
                t = tf.get(id, 0)
                diff = fcount - t
                tf[id] = fcount
                fcount += diff

            # Nothing survived the filters: avoid dividing by zero below.
            if not fcount:
                continue

            # Express each weight as a share of the total in 1/10000ths and
            # drop shares that truncate to zero.
            t = []
            for topic, f in tf.iteritems():
                rank = int(f * 10000 / fcount)
                if rank:
                    t.append((topic, rank))

            if t:
                # `dumps` is whatever the file imports under that name
                # (presumably json.dumps -- confirm).
                word_freq.write(dumps([word, t]) + "\n")
#coding:utf-8 import _env from json import loads from zhihu_topic_data_with_follow import ZHIHU_TOPIC from name2id import NAME2ID from zdata.tag.name_tidy import name_tidy from zhihu_topic_url2id import ID2MY from itertools import chain from zhihu_question_load import zhihu_to_dump id2topic = dict([(i[1], i[0]) for i in ZHIHU_TOPIC]) myidset = set(NAME2ID.itervalues()) myiddict = dict([(k, v) for v, k in NAME2ID.iteritems()]) def tag_id_list_by_str_list(tags): tag_list = [] for tag in tags: id = tag_to_id(tag) if not id: continue else: tag_list.append(id) return tag_list def tag_to_id(tag):
#coding:utf-8 import _env from name2id import NAME2ID from zkit.txt_cleanup import sp_txt from collections import defaultdict from zkit.pprint import pprint sp2id = defaultdict(list) for k, v in NAME2ID.iteritems(): for i in sp_txt(k): sp2id[i].append(k) word_parent = defaultdict(set) for k, v in NAME2ID.iteritems(): for i in sp_txt(k): for j in sp2id[i]: if j != k and k in j: #print k, j word_parent[NAME2ID[j]].add(NAME2ID[k]) id2name = dict((k, v) for v, k in NAME2ID.iteritems()) #for id, pid_list in word_parent.iteritems(): # print id2name[id] # for i in pid_list: # print id2name[i], # print "\n" word_parent = dict((k, tuple(v)) for k, v in word_parent.iteritems())
#coding:utf-8 import _env from json import loads from zhihu_topic_data_with_follow import ZHIHU_TOPIC from name2id import NAME2ID from zdata.tag.name_tidy import name_tidy from zhihu_topic_url2id import ID2MY from itertools import chain from zhihu_question_load import zhihu_to_dump id2topic = dict([(i[1], i[0]) for i in ZHIHU_TOPIC]) myidset = set(NAME2ID.itervalues()) myiddict = dict([(k, v) for v, k in NAME2ID.iteritems()]) def tag_id_list_by_str_list(tags): tag_list = [] for tag in tags: id = tag_to_id(tag) if not id: continue else: tag_list.append(id) return tag_list def tag_to_id(tag): tag = str(tag) id = id2topic.get(tag, 0)
def main():
    """Call tag_alias_new once for every (name, id) pair in NAME2ID.

    The name is passed as ``alias`` and the mapped value as ``id``.
    """
    for alias, tag_id in NAME2ID.iteritems():
        # autocomplete_tag.append(alias, id)
        tag_alias_new(alias=alias, id=tag_id)