示例#1
0
def build_user_dict():
    """
    Build (or reuse) the user-defined dictionary file and return its path.

    Two settings, read through ``getDBConfigure``, drive the behaviour:

    * ``USE_USER_DEFINED_DICT`` (default 0): when off, an empty
      ``nulluserdict.txt`` is created if it is missing and its path is
      returned.
    * ``RE_BUILD_USER_DEFINED_DICT`` (default 1): when off, the existing
      ``userdict.txt`` is reused (an empty one is created if it is missing).

    Otherwise the rebuild flag is reset to 0 and ``userdict.txt`` is
    regenerated from every ``Vocabulary`` row with ``brand="user"``, one
    UTF-8 encoded ``word frequency characteristic`` line per row.

    All files live in the directory of this module and are chmod-ed to 0666.

    Returns:
        The path of the dictionary file that should be used.
    """
    base_dir = dirname(os.path.abspath(__file__))
    # rw-rw-rw- (0666)
    file_permission = (stat.S_IRUSR | stat.S_IWUSR |
                       stat.S_IRGRP | stat.S_IWGRP |
                       stat.S_IROTH | stat.S_IWOTH)
    user_dict_file = os.path.join(base_dir, "userdict.txt")

    use_user_dict = getDBConfigure("USE_USER_DEFINED_DICT",
                                   default=0,
                                   type_=lambda x: bool(int(x)))
    if not use_user_dict:
        null_dict_file = os.path.join(base_dir, "nulluserdict.txt")
        _ensure_empty_dict_file(null_dict_file, file_permission,
                                "created nulluserdict.txt")
        print("USE_USER_DEFINED_DICT=OFF,use null dict instead")
        return null_dict_file

    need_rebuild = getDBConfigure("RE_BUILD_USER_DEFINED_DICT",
                                  default=1,
                                  type_=lambda x: bool(int(x)))
    if not need_rebuild:
        _ensure_empty_dict_file(
            user_dict_file, file_permission,
            "File userdict.txt does not exist,create an empty one")
        print("use the old userdict.txt file")
        return user_dict_file

    # The dictionary has to be rebuilt.  The flag is cleared *before* the
    # rebuild starts, exactly as before.
    # NOTE(review): presumably so that a rebuild request raised while we are
    # still writing is not wiped out afterwards -- confirm before reordering.
    setDBConfigure("RE_BUILD_USER_DEFINED_DICT", option=0)
    # Remove first so that the new file is created (and owned) by the current
    # process instead of truncating a file that may belong to someone else.
    if os.path.exists(user_dict_file):
        os.remove(user_dict_file)

    template_line = u"{word} {frequency} {characteristic}\n"
    step = 300
    user_words = Vocabulary.objects.filter(brand="user")
    word_count = user_words.count()
    # Slicing an unordered queryset gives no guarantee that consecutive pages
    # are disjoint, so rows could be duplicated or skipped.  Ordering by the
    # primary key makes the pagination deterministic.
    rows = user_words.order_by("pk") \
                     .values("word", "frequency", "characteristic")
    with open(user_dict_file, 'w') as f:
        for offset in range(0, word_count, step):
            for row in rows[offset:offset + step]:
                f.write(template_line.format(**row).encode("utf-8"))

    os.chmod(user_dict_file, file_permission)
    print("rebuilt the userdict.txt")
    return user_dict_file


def _ensure_empty_dict_file(path, permission, message):
    """Create an empty file at ``path`` with ``permission`` if it is missing.

    ``message`` is printed once the file has been created; nothing happens
    when the file already exists.
    """
    if os.path.exists(path):
        return
    with open(path, "w+"):
        print(message)
    os.chmod(path, permission)
示例#2
0
def newsvolume():
    """
    Return the total number of ``News`` rows, cached under "news_amount".

    On a cache miss the count is read from the database and stored for
    ``NEWS_COUNT_CACHE_LIFE`` seconds (read through ``getDBConfigure``,
    default ``4 * 60 * 40``).

    Returns:
        int: the cached or freshly counted number of news rows.
    """
    news_amount = cache.get("news_amount")
    # Test for None explicitly: a legitimately cached count of 0 is falsy and
    # would otherwise trigger a database query on every call.
    if news_amount is None:
        news_amount = News.objects.all().count()
        # NOTE(review): the default evaluates to 9600 seconds; possibly
        # 4 * 60 * 60 was intended -- confirm before changing.
        cache_life = getDBConfigure("NEWS_COUNT_CACHE_LIFE",
                                    default=4 * 60 * 40,
                                    type_=int)
        cache.set('news_amount', news_amount, cache_life)
    # Return the local value instead of reading the cache a second time: the
    # entry may already have expired or may never have been stored.
    return news_amount
示例#3
0
def hotwords():
    """
    Return ``{'hotwords': <list>}`` with the current hot words.

    The list is cached under "hot_words_list".  On a cache miss it is
    computed with ``HotWords.appear_most()`` and stored for
    ``RECOMM_CACHE_LIFE`` seconds (read through ``getDBConfigure``,
    default 30).
    """
    hot_words_list = cache.get("hot_words_list")
    # Test for None explicitly: an empty list is a valid (falsy) result and
    # would otherwise be recomputed on every call.
    if hot_words_list is None:
        cache_life = getDBConfigure("RECOMM_CACHE_LIFE",
                                    default="30",
                                    type_=int)
        hot_words_list = HotWords.appear_most()
        cache.set("hot_words_list", hot_words_list, cache_life)
    # Use the local value rather than a second cache read, which could
    # return None if the entry expired or was never stored.
    return {'hotwords': hot_words_list}
示例#4
0
def recommend_words():
    """
    Return the current hot-word list.

    The list is cached under "hot_words_list".  On a cache miss it is
    computed with ``HotWords.appear_most()`` and stored for
    ``RECOMM_CACHE_LIFE`` seconds (read through ``getDBConfigure``,
    default 30).
    """
    hot_words_list = cache.get("hot_words_list")
    # Test for None explicitly: an empty list is a valid (falsy) result and
    # would otherwise be recomputed on every call.
    if hot_words_list is None:
        cache_life = getDBConfigure("RECOMM_CACHE_LIFE",
                                    default="30",
                                    type_=int)
        hot_words_list = HotWords.appear_most()
        cache.set("hot_words_list", hot_words_list, cache_life)
    # Use the local value rather than a second cache read, which could
    # return None if the entry expired or was never stored.
    return hot_words_list
示例#5
0
    def appear_most(cls):
        """
        Return the keywords that score highest across recent news titles.

        The tunables below are read from the database through
        ``getDBConfigure`` on every call (they are deliberately not cached):

        * ``RECOMM_DAYS`` (2): how many days back the news window reaches.
        * ``RECOMM_RANK_GT`` (200): only news with
          ``rank > RANK_SORT_PARAMETER - RECOMM_RANK_GT`` is considered.
        * ``RECOMM_NEWSLIMIT`` (400): maximum number of news rows scanned.
        * ``RECOMM_HALF_DESC`` (30.0): damping constant of the score.
        * ``RECOMM_NUM`` (8): number of words returned.
        * ``RECOMM_RECORD_HOT`` (0): when on, the returned words are also
          recorded in ``HotWordTrace``.
        * ``RECOMM_ALLOWED_WORD_TYPE_REGEX`` ("(nr|nz|nt)"): compiled and
          stored as ``cls.regex_allowed``.

        Every title is segmented with ``jieba.posseg.cut`` and the tokens are
        passed through ``filter_out_short``, ``filter_out_number``,
        ``filter_out_deny`` and ``filter_in_only``.  Each surviving token adds
        ``RECOMM_HALF_DESC / (RANK_SORT_PARAMETER - rank +
        int(RECOMM_HALF_DESC))`` to the running total of its word.

        Returns:
            list: up to ``RECOMM_NUM`` words, highest total score first.
        """
        cls.init()

        RECOMM_DAYS = getDBConfigure("RECOMM_DAYS", default=2, type_=int)
        RECOMM_RANK_GT = getDBConfigure("RECOMM_RANK_GT", default=200, type_=int)
        RECOMM_NEWSLIMIT = getDBConfigure("RECOMM_NEWSLIMIT", default=400, type_=int)
        RECOMM_HALF_DESC = getDBConfigure("RECOMM_HALF_DESC", default="30.0", type_=float)
        RECOMM_NUM = getDBConfigure("RECOMM_NUM", default=8, type_=int)
        RECOMM_RECORD_HOT = getDBConfigure("RECOMM_RECORD_HOT", default=0,
                                           type_=lambda v: bool(int(v)))
        RECOMM_ALLOWED_WORD_TYPE_REGEX = getDBConfigure(
            "RECOMM_ALLOWED_WORD_TYPE_REGEX", default="(nr|nz|nt)", type_=str)

        # Regular expression of the allowed part-of-speech tags.
        # NOTE(review): presumably consumed by cls.filter_in_only -- confirm.
        cls.regex_allowed = re.compile(RECOMM_ALLOWED_WORD_TYPE_REGEX)

        # One "today" for the whole call, so the news window and the trace
        # look-ups below cannot disagree if the call runs across midnight.
        today = datetime.date.today()
        window_start = today - datetime.timedelta(days=RECOMM_DAYS)
        recent_news = News.objects.filter(news_time__gte=window_start,
                                          news_time__lte=today,
                                          rank__gt=(RANK_SORT_PARAMETER - RECOMM_RANK_GT))\
                                  .only("title")\
                                  .order_by("-news_time", "-rank")[0:RECOMM_NEWSLIMIT]

        half_desc_int = int(RECOMM_HALF_DESC)  # loop invariant
        tagMap = dict()
        for news in recent_news.values("id", "title", "rank"):
            tags = [tag for tag in jieba.posseg.cut(news["title"])
                    if cls.filter_out_short(tag)
                    and cls.filter_out_number(tag)
                    and cls.filter_out_deny(tag)
                    and cls.filter_in_only(tag)]
            if not tags:
                # Nothing to score; skipping also keeps the division below
                # from being evaluated for titles that contribute no word.
                continue
            # The score depends on the news item only, not on the token.
            score = RECOMM_HALF_DESC / (RANK_SORT_PARAMETER - news["rank"] + half_desc_int)
            for tag in tags:
                key = tag.word
                # dict.has_key() no longer exists in Python 3; get() works
                # on both major versions.
                tagMap[key] = tagMap.get(key, 0.0) + score

        hot_word_tube = sorted(tagMap.items(), key=lambda x: x[1], reverse=True)
        top_words = hot_word_tube[0:RECOMM_NUM]
        hot_words = [x[0] for x in top_words]

        # Optionally record today's hot words.
        if RECOMM_RECORD_HOT:
            for position, (word, total_score) in enumerate(top_words, 1):
                try:
                    # Only the outcome matters: found once, missing, or
                    # duplicated.
                    HotWordTrace.objects.only("word").get(time=today, word=word)
                except HotWordTrace.DoesNotExist:
                    # Debug note stored next to the record.
                    note = "using %s/(1+%s)" % (RECOMM_HALF_DESC, RECOMM_HALF_DESC)
                    HotWordTrace(word=word, rank=position,
                                 score=total_score, note=note).save()
                except HotWordTrace.MultipleObjectsReturned:
                    # Keep the first matching row and delete the others.
                    duplicates = HotWordTrace.objects.only("id")\
                                             .filter(time=today, word=word)
                    surplus_ids = [row[0] for row in duplicates.values_list("id")[1:]]
                    HotWordTrace.objects.filter(id__in=surplus_ids).delete()

        return hot_words