예제 #1
0
def tag_id_rank_list_by_txt(txt):
    """Score tags for a text and keep those whose name occurs in it.

    The text is lower-cased, then every ``(word, rank)`` pair produced by
    ``tf_idf_seg_txt`` is looked up in ``db_tag_bayes``.  A hit is loaded
    into an ``array('I')`` and walked two items at a time as
    ``(tag_id, bayes)``; ``bayes * rank`` is accumulated per tag id.

    Returns a list of ``(tag_id, score)`` tuples ordered by descending
    score.  A tag is kept only if it is present in ``ID2NAME`` and at least
    one of its names is found in the lower-cased text.
    """
    txt = txt.lower()

    # Accumulate a score per tag id over every word found in the store.
    scores = defaultdict(int)
    for word, weight in tf_idf_seg_txt(txt):
        packed = db_tag_bayes.get(word)
        if not packed:
            continue
        # The stored value is decoded as unsigned ints; presumably a flat
        # sequence of (tag_id, bayes) pairs -- confirm against the writer.
        pairs = array('I')
        pairs.fromstring(packed)
        for tag_id, bayes in chunkiter(pairs, 2):
            scores[tag_id] += bayes * weight

    def _mentioned(name):
        # When sp_txt yields pieces for the name, any single piece occurring
        # in the text counts; otherwise the whole name must occur.
        pieces = list(sp_txt(name))
        if pieces:
            return any(str(piece) in txt for piece in pieces)
        return name in txt

    ranked = sorted(scores.iteritems(), key=itemgetter(1), reverse=True)

    return [
        (tag_id, score)
        for tag_id, score in ranked
        if tag_id in ID2NAME
        and any(_mentioned(name) for name in ID2NAME[tag_id])
    ]
예제 #2
0
def tag_id_rank_list_by_txt(txt):
    """Return ``(tag_id, weight)`` pairs for tags that the text mentions.

    Steps, all performed on the lower-cased text:

    1. For each ``(word, tf_idf)`` from ``tf_idf_seg_txt``, fetch the word's
       entry from ``db_tag_bayes``, load it into an ``array("I")`` and read
       it in chunks of two as ``(tag_id, bayes)``, adding
       ``bayes * tf_idf`` to that tag's running weight.
    2. Sort the tags by weight, highest first.
    3. Drop tags missing from ``ID2NAME`` and tags none of whose names can
       be found in the text.
    """
    txt = txt.lower()

    weight_by_tag = defaultdict(int)
    for word, tf_idf in tf_idf_seg_txt(txt):
        raw = db_tag_bayes.get(word)
        if not raw:
            # Word has no stored entry (or an empty one): nothing to add.
            continue
        unpacked = array("I")
        unpacked.fromstring(raw)
        for tag_id, bayes in chunkiter(unpacked, 2):
            weight_by_tag[tag_id] += bayes * tf_idf

    ordered = sorted(
        weight_by_tag.iteritems(), key=itemgetter(1), reverse=True
    )

    matched = []
    for tag_id, weight in ordered:
        if tag_id not in ID2NAME:
            continue

        for name in ID2NAME[tag_id]:
            parts = list(sp_txt(name))
            if parts:
                # The name splits into parts: one part in the text suffices.
                found = any(str(part) in txt for part in parts)
            else:
                # No parts produced: fall back to the whole name.
                found = name in txt

            if found:
                matched.append((tag_id, weight))
                break

    return matched
예제 #3
0
#coding:utf-8
import _env
from name2id import NAME2ID
from zkit.txt_cleanup import sp_txt
from collections import defaultdict
from zkit.pprint import pprint

# Map every fragment yielded by sp_txt(name) to the names that produced it.
sp2id = defaultdict(list)

for name, _tag_id in NAME2ID.iteritems():
    for piece in sp_txt(name):
        sp2id[piece].append(name)

# For each name, inspect the names that share one of its fragments.  When
# this name occurs inside a *different* name, file this name's id under the
# id of the containing name.
word_parent = defaultdict(set)

for name, tag_id in NAME2ID.iteritems():
    for piece in sp_txt(name):
        for candidate in sp2id[piece]:
            if candidate == name or name not in candidate:
                continue
            word_parent[NAME2ID[candidate]].add(tag_id)

# Reverse lookup: id -> name.
id2name = {}
for name, tag_id in NAME2ID.iteritems():
    id2name[tag_id] = name

# Debug dump of the relation (disabled):
#for id, pid_list in word_parent.iteritems():
#    print id2name[id]
#    for i in pid_list:
#        print id2name[i],
#    print "\n"

# Replace the mutable sets with tuples in a plain dict.
word_parent = dict(
    (tag_id, tuple(contained_ids))
    for tag_id, contained_ids in word_parent.iteritems()
)
예제 #4
0
#coding:utf-8
import _env
from name2id import NAME2ID
from zkit.txt_cleanup import sp_txt
from collections import defaultdict
from zkit.pprint import pprint

# Fragment index: each piece produced by sp_txt for a name points back to
# the list of names it was produced from.
sp2id = defaultdict(list)

for full_name, _ in NAME2ID.iteritems():
    for fragment in sp_txt(full_name):
        names_for_fragment = sp2id[fragment]
        names_for_fragment.append(full_name)

# word_parent[id of longer name] collects the ids of the other names that
# appear as a substring of it; only names sharing a fragment are compared.
word_parent = defaultdict(set)

for shorter, shorter_id in NAME2ID.iteritems():
    for fragment in sp_txt(shorter):
        for longer in sp2id[fragment]:
            is_other_name = longer != shorter
            if is_other_name and shorter in longer:
                longer_id = NAME2ID[longer]
                word_parent[longer_id].add(shorter_id)

# Inverse of NAME2ID, keyed by id.
id2name = dict(
    (ident, full_name) for full_name, ident in NAME2ID.iteritems()
)

# Optional inspection of the result (kept disabled):
#for id, pid_list in word_parent.iteritems():
#    print id2name[id]
#    for i in pid_list:
#        print id2name[i],
#    print "\n"

# Freeze the collected sets into tuples and drop the defaultdict behaviour.
word_parent = dict(
    (ident, tuple(id_set)) for ident, id_set in word_parent.iteritems()
)