예제 #1
0
def tag_to_id(tag):
    """Resolve *tag* to an id, returning 0 when nothing matches.

    The stringified tag is first looked up in ``id2topic``.  The value found
    there is translated through ``ID2MY`` and kept only when the translated
    id is a member of ``myidset``.  If that path yields no truthy id, the
    tag is normalised with ``name_tidy`` and looked up in ``NAME2ID``.
    """
    text = str(tag)
    topic = id2topic.get(text, 0)

    resolved = 0
    if topic in ID2MY:
        candidate = ID2MY[topic]
        if candidate in myidset:
            resolved = candidate

    if resolved:
        return resolved

    # Nothing usable from the id tables: fall back to a lookup by tidied name.
    return NAME2ID.get(name_tidy(text), 0)
def tag_to_id(tag):
    """Map a tag to its id via ``id2topic``/``ID2MY``, else by tidied name.

    Returns 0 if neither the id tables nor ``NAME2ID`` produce an id.
    """
    key = str(tag)
    source_id = id2topic.get(key, 0)

    mapped = 0
    if source_id in ID2MY:
        translated = ID2MY[source_id]
        # A translated id only counts when it belongs to myidset.
        mapped = translated if translated in myidset else 0

    if not mapped:
        mapped = NAME2ID.get(name_tidy(key), 0)
    return mapped
예제 #3
0
def merge():
    """Turn the per-word topic counts stored in Redis into ``word_tf.txt``.

    Every Redis key is treated as a word whose hash maps integer topic ids
    to counts.  The work is done in two passes over the same key list:

    1. Sum the counts of every hash into a per-topic total.
    2. For each word, scale each count to "per 500000 of the topic total"
       (topics whose total is below 10000 are skipped, as are counts that
       scale down to 0), then express each topic as its share of the
       word's total on a 0-10000 scale.  Words with at least one non-zero
       share are written as one ``dumps([word, [(topic, rank), ...]])``
       line.

    If the tidied word is itself a key of ``NAME2ID``, the weight of that id
    is raised to the sum of all the word's weights before shares are
    computed.

    Returns nothing; the result is the file ``word_tf.txt`` in the current
    working directory, plus one progress line per key on stdout.
    """
    topic_count = defaultdict(int)

    keys = redis.keys("*")
    for pos, key in enumerate(keys):
        counts = redis.hgetall(key)
        # Single-argument form prints the same text as `print "1", pos, key`.
        print("1 %s %s" % (pos, key))
        for topic, count in counts.items():
            topic_count[int(topic)] += int(count)

    with open("word_tf.txt", "w") as word_freq:
        for word in keys:
            tf = {}
            fcount = 0
            for topic, freq in redis.hgetall(word).items():
                topic = int(topic)
                total = topic_count[topic]
                if total < 10000:
                    continue
                # `//` is the integer division Python 2's `/` did on ints.
                scaled = int(freq) * 500000 // total
                if scaled > 0:
                    tf[topic] = scaled
                    fcount += scaled

            word_id = NAME2ID.get(name_tidy(word), 0)
            if word_id:
                # Give the word's own id a weight equal to the current total;
                # the new total is that weight plus all the other weights.
                others = fcount - tf.get(word_id, 0)
                tf[word_id] = fcount
                fcount += others

            if not fcount:
                continue

            ranks = []
            for topic, weight in tf.items():
                rank = weight * 10000 // fcount
                if rank:
                    ranks.append((topic, rank))
            if ranks:
                word_freq.write(dumps([word, ranks]) + "\n")
예제 #4
0
def merge():
    """Aggregate the Redis word hashes into ``word_tf.txt``.

    Each Redis key is read as a word and its hash as ``{topic id: count}``.
    A first pass totals the counts per topic.  A second pass re-reads each
    hash, scales its counts with ``_scaled_topic_freqs`` and writes one
    ``dumps([word, [(topic, rank), ...]])`` line per word, where ``rank`` is
    the topic's share of the word's total weight on a 0-10000 scale and
    zero ranks are dropped.  Words left with no weight or no rank are not
    written.

    When ``NAME2ID`` has an entry for the tidied word, that id's weight is
    replaced by the sum of all the word's weights before ranking.

    Prints one progress line per key during the first pass; returns None.
    """
    topic_count = defaultdict(int)

    keys = redis.keys("*")
    for pos, key in enumerate(keys):
        fields = redis.hgetall(key)
        # Same output as the statement form `print "1", pos, key`.
        print("1 %s %s" % (pos, key))
        for topic, count in fields.items():
            topic_count[int(topic)] += int(count)

    with open("word_tf.txt", "w") as word_freq:
        for word in keys:
            pairs = _scaled_topic_freqs(redis.hgetall(word), topic_count)
            fcount = sum(freq for _, freq in pairs)
            tf = dict(pairs)

            own_id = NAME2ID.get(name_tidy(word), 0)
            if own_id:
                # The word's own id gets the whole current total as weight;
                # the total then grows by everything that was not already
                # attributed to that id.
                rest = fcount - tf.get(own_id, 0)
                tf[own_id] = fcount
                fcount += rest

            if not fcount:
                continue

            ranked = []
            for topic, weight in tf.items():
                # Integer division, as Python 2's `/` performed on ints.
                rank = weight * 10000 // fcount
                if rank:
                    ranked.append((topic, rank))
            if ranked:
                word_freq.write(dumps([word, ranked]) + "\n")


def _scaled_topic_freqs(raw, topic_count):
    """Return ``[(topic, freq)]`` with each count in *raw* scaled per 500000
    of its topic's total; topics totalling under 10000 and counts that
    scale to 0 are left out."""
    scaled = []
    for topic, freq in raw.items():
        topic = int(topic)
        total = topic_count[topic]
        if total < 10000:
            continue
        freq = int(freq) * 500000 // total
        if freq > 0:
            scaled.append((topic, freq))
    return scaled