def _createConferenceObject(self, request):
        """Create a Conference entity for the signed-in user; echo back the form.

        Raises UnauthorizedException when no user is signed in and
        BadRequestException when the request lacks a 'name'.
        """
        # The caller must be authenticated before anything else happens.
        user = endpoints.get_current_user()
        if not user:
            raise endpoints.UnauthorizedException('Authorization required')
        user_id = util.getUserId(user)

        if not request.name:
            raise endpoints.BadRequestException("Conference 'name' field required")

        # Snapshot every inbound message field into a plain dict, then drop
        # the two output-only fields the datastore model does not store.
        data = {}
        for field in request.all_fields():
            data[field.name] = getattr(request, field.name)
        del data['websafeKey']
        del data['organizerDisplayName']

        # Backfill defaults on both the dict (data model) and the outbound message.
        for name in DEFAULTS:
            if data[name] in (None, []):
                data[name] = DEFAULTS[name]
                setattr(request, name, DEFAULTS[name])

        # Parse ISO date strings into date objects; month is derived from the
        # start date so conferences can later be queried by month.
        if data['startDate']:
            start = datetime.strptime(data['startDate'][:10], "%Y-%m-%d").date()
            data['startDate'] = start
            data['month'] = start.month
        else:
            data['month'] = 0
        if data['endDate']:
            data['endDate'] = datetime.strptime(data['endDate'][:10], "%Y-%m-%d").date()

        # A brand-new conference has every seat open, so mirror maxAttendees
        # into seatsAvailable on both the data model and the outbound message.
        capacity = data['maxAttendees']
        if capacity > 0:
            data['seatsAvailable'] = capacity
            request.seatsAvailable = capacity

        # Key the new Conference under the creator's Profile so ancestor
        # queries can find everything a user created.
        profile_key = ndb.Key(Profile, user_id)
        conference_id = Conference.allocate_ids(size=1, parent=profile_key)[0]
        data['key'] = ndb.Key(Conference, conference_id, parent=profile_key)
        request.organizerUserId = user_id
        data['organizerUserId'] = user_id

        # Persist and hand the (now-populated) form back to the caller.
        Conference(**data).put()
        return request
    def getConferencesCreated(self, request):
        """Return a ConferenceForms of every conference the caller created."""
        # Reject anonymous callers up front.
        user = endpoints.get_current_user()
        if not user:
            raise endpoints.UnauthorizedException('Authorization required')

        # An ancestor query scoped to the caller's Profile key returns only
        # conferences this user created.
        profile_key = ndb.Key(Profile, util.getUserId(user))
        owned = Conference.query(ancestor=profile_key)

        # Resolve the organizer's display name from the stored Profile.
        profile = profile_key.get()
        display_name = getattr(profile, 'displayName')

        forms = [self._copyConferenceToForm(conf, display_name) for conf in owned]
        return ConferenceForms(items=forms)
    def _getProfileFromUser(self):
        """Fetch the caller's Profile from datastore, creating one on first use."""
        # Authentication gate: anonymous callers get a 401.
        user = endpoints.get_current_user()
        if not user:
            raise endpoints.UnauthorizedException('Authorization required')

        key = ndb.Key(Profile, util.getUserId(user))
        profile = key.get()
        if not profile:
            # First visit: seed a Profile from the account's basic info and
            # persist it so subsequent calls find it.
            profile = Profile(
                key=key,
                displayName=user.nickname(),
                mainEmail=user.email(),
                teeShirtSize=str(TeeShirtSize.NOT_SPECIFIED),
            )
            profile.put()

        return profile
# Example #4 (score: 0)
def ldaa(dic, corpus, tfidf, bContentList, newUniBContentListString, topicNum):
    """Fit an LDA topic model over a barrage (bullet-comment) corpus and
    analyse topic cohesion, user tag distributions, and top words.

    Parameters (inferred from usage below -- confirm against callers):
        dic: gensim Dictionary used for doc2bow conversions.
        corpus: bag-of-words corpus (list of doc2bow vectors).
        tfidf: gensim TfidfModel applied to `corpus`.
        bContentList: tokenised barrage contents, one token list per barrage.
        newUniBContentListString: per-barrage records; index [0] appears to be
            a user code, [1] a timestamp, [2] the raw text -- TODO confirm.
        topicNum: number of LDA topics to fit.

    Returns:
        (aveInnDis, aveExtDis): reciprocals of the mean squared intra-topic
        and inter-topic similarities (see NOTE at their computation).

    Side effects: prints many diagnostics, writes topicDist.txt and
    topicPercDist.txt to the working directory, and shows matplotlib figures.

    NOTE(review): relies on names not visible in this chunk (vCid, util,
    clustering, xmlDL, getMax, getMostWord, rmNum, np, plt, models,
    similarities, os) -- presumably module-level imports/globals; verify.
    """
    # Weight the bag-of-words corpus by tf-idf before fitting LDA.
    corpus_tfidf = tfidf[corpus]
    print '------------------type(corpus_tfidf):', type(corpus_tfidf)
    # for i in corpus_tfidf:
    #     print i

    # Fit the LDA model and grab a printable summary of every topic.
    lda = models.LdaModel(corpus_tfidf, id2word=dic, num_topics=topicNum)
    ldaOut = lda.print_topics(topicNum)

    # NOTE(review): `li` and both `vec` assignments below are never read
    # again (`vec` is rebound inside the loop) -- looks like leftover scratch.
    li = 5
    vec = [(0, 1), (4, 1)]
    vec = dic.doc2bow(bContentList[li])

    # get similarity matrix of len(bContentList) * len(bContentList)
    index = similarities.MatrixSimilarity(lda[corpus])
    simMatrix = []

    # Build the full pairwise similarity matrix of all barrages
    # (eg: 100 * 100); simMatrix[i][j] == (j, similarity(barrage i, barrage j)).
    for bIndex in range(len(bContentList)):
        vec = bContentList[bIndex]
        vec_bow = dic.doc2bow(bContentList[bIndex])
        vec_lda = lda[vec_bow]
        sims = index[vec_lda]

        # print list(enumerate(sims))

        # sorted with similarity from high to low
        # sims = sorted(enumerate(sims), key=lambda item: -item[1])
        # print sims, len(sims), type(sims)

        simMatrix.append(list(enumerate(sims)))

    # eg: simMatrix[1] = [(0, 0.91061151), (1, 0.99999994), (2, 0.99999994), (3, 0.99999994), (4, 0.73748994), (5, 0.81874228)......]
    # print len(simMatrix), simMatrix[1]
    # sys.exit()

    # print all lda topics words
    # such as (sample output; the quoted tokens are Chinese barrage words):
    # 0.002*"合影" + 0.002*"钱" + 0.002*"撒花" + 0.002*"没" + 0.002*"完结" + 0.002*"看" + 0.002*"啊" + 0.002*"之" + 0.002*"湫" + 0.002*"一个"
    # 0.002*"买" + 0.002*"第一次" + 0.002*"支持" + 0.002*"啊" + 0.002*"没" + 0.002*"完结" + 0.002*"湫" + 0.002*"国漫" + 0.002*"撒花" + 0.002*"b"
    # 0.004*"第一次" + 0.003*"湫" + 0.003*"合影" + 0.003*"在" + 0.003*"存活" + 0.003*"买" + 0.003*"确认" + 0.003*"啊" + 0.003*"椿" + 0.002*"撒花"
    # 0.003*"完结" + 0.003*"撒花" + 0.003*"钱" + 0.003*"合影" + 0.002*"再见" + 0.002*"没" + 0.002*"啊" + 0.002*"湫" + 0.002*"好" + 0.001*"第一次"
    # 0.003*"存活" + 0.003*"确认" + 0.002*"合影" + 0.002*"没" + 0.002*"钱" + 0.002*"秋水共长天一色" + 0.002*"第一次" + 0.001*"靠" + 0.001*"也" + 0.001*"生日"
    for i in ldaOut:
        r = i[1].encode('utf-8')
        print r

    # Print the topics a second time, numbered, with the weight prefixes
    # stripped by rmNum().  NOTE(review): this loop duplicates the one above.
    for i in ldaOut:
        r = i[1].encode('utf-8')
        print 'Topic', ldaOut.index(i), ':',
        util.printList(rmNum(r))

    # sys.exit()

    print type(ldaOut[0])
    print type(ldaOut[0][0])

    # Per-document topic distributions over the tf-idf corpus.
    corpus_lda = lda[corpus_tfidf]
    resList = []
    # NOTE(review): iii is never used.
    iii = 0

    # eg: doc [(0, 0.033333333333334041), (1, 0.033333333333659149), (2, 0.03333333333337106), (3, 0.033333333333336511), (4, 0.033333333333333631), (5, 0.033333333577374141), (6, 0.033333333333333381), (7, 0.53333330176939997), (8, 0.033333333641347308), (9, 0.033333333333333388), (10, 0.033333333333333409), (11, 0.033333358397907714), (12, 0.033333333333333381), (13, 0.033333333333333368), (14, 0.033333339280269603)]
    for doc in corpus_lda:
        # eg: res = (3, 0.72867093662442284) -- doc has a 72% probability of
        # belonging to topic 3.  getMax presumably picks the max-weight pair.
        res = getMax(doc)
        resList.append(res)
    print '---type(corpus_tfidf), type(corpus_lda)', type(corpus_tfidf), type(
        corpus_lda)
    print '---len(resList)', len(resList)

    # simMatrixTopicList[t] = indices of barrages assigned to topic t
    # (outer list length == topicNum).
    simMatrixTopicList = []
    for topicId in range(topicNum):
        simMatrixTopic = [
            i for i in range(len(resList)) if resList[i][0] == topicId
        ]
        print topicId, 'topic has:', len(simMatrixTopic), 'barrage'
        simMatrixTopicList.append(simMatrixTopic)
        # print len(simMatrixTopic), simMatrixTopic

    # Earlier non-squared variant of the distance computation, kept disabled:
    # # inner distance
    # # sum of all similarity of i'th row
    # iRow = 0.0
    # num = 0
    # innDisMatrix = [0.0 for i in range(topicNum)]
    # for topicId in range(topicNum):
    #     for i in range(len(simMatrixTopicList[topicId])-1):
    #         for j in range(i+1, len(simMatrixTopicList[topicId])):
    #             # print simMatrix[simMatrixTopicList[topicId][i]][simMatrixTopicList[topicId][j]][1]
    #             iRow += simMatrix[simMatrixTopicList[topicId][i]][simMatrixTopicList[topicId][j]][1]
    #         # print topicId, 'topic, num:', num
    #     lenOfIRow = len(simMatrixTopicList[topicId])
    #     numOfIRow = (1 + lenOfIRow - 1) * (lenOfIRow - 1) / 2
    #     innDisMatrix[topicId] = iRow/numOfIRow
    #     iRow = 0.0
    # print 'inner distance:', innDisMatrix
    #
    # aveInnDis = sum(innDisMatrix) / len(innDisMatrix)
    # print 'average inner distance:', aveInnDis
    #
    # # external distance
    # cols = topicNum
    # rows = topicNum
    # extDisMatrix = [[0.0 for col in range(cols)] for row in range(rows)]
    # iRow = 0.0
    # for topicId in range(topicNum):
    #     for ti2 in range(topicId+1, topicNum):
    #         for i in range(len(simMatrixTopicList[topicId])):
    #             for j in range(len(simMatrixTopicList[ti2])):
    #                 iRow += simMatrix[simMatrixTopicList[topicId][i]][simMatrixTopicList[ti2][j]][1]
    #             # iRow += iRow
    #         # print iRow
    #         lenOfIRow = len(simMatrixTopicList[topicId]) * len(simMatrixTopicList[ti2])
    #         extDisMatrix[topicId][ti2] = iRow / float(lenOfIRow)
    #         iRow = 0.0
    #
    # print 'external distance:', extDisMatrix
    #
    # totExtDis = 0
    # aveExtDis = 0
    # num = 0
    # for i in extDisMatrix:
    #     for j in i:
    #         if j != 0:
    #             totExtDis += j
    #             num += 1
    # aveExtDis = totExtDis / float(num)
    #
    # print 'average external distance:', aveExtDis
    # print 'inner/external value:', aveInnDis/aveExtDis

    # Active variant: same distances but using squared similarities (**2).

    # Inner (intra-topic) distance: mean squared similarity over every
    # unordered pair of barrages within the same topic.
    iRow = 0.0
    num = 0
    innDisMatrix = [0.0 for i in range(topicNum)]

    # innDisMatrixNum[0]: the number of similarity value every topic
    innDisMatrixNum = [0.0 for i in range(topicNum)]
    for topicId in range(topicNum):
        for i in range(len(simMatrixTopicList[topicId]) - 1):
            for j in range(i + 1, len(simMatrixTopicList[topicId])):
                iRow += (simMatrix[simMatrixTopicList[topicId][i]][
                    simMatrixTopicList[topicId][j]][1])**2
            # print topicId, 'topic, num:', num
        lenOfIRow = len(simMatrixTopicList[topicId])
        # C(lenOfIRow, 2) -- Python 2 integer division.  NOTE(review): this is
        # 0 when a topic holds fewer than 2 barrages, so the next line would
        # raise ZeroDivisionError -- confirm topics are always populated.
        numOfIRow = (1 + lenOfIRow - 1) * (lenOfIRow - 1) / 2
        innDisMatrix[topicId] = iRow / numOfIRow
        # innDisMatrixNum[topicId] = numOfIRow
        iRow = 0.0
    print 'inner distance:', innDisMatrix

    # NOTE(review): despite the name, this is the *reciprocal* of the mean
    # squared intra-topic similarity (higher cohesion => smaller value).
    aveInnDis = 1 / (sum(innDisMatrix) / topicNum)
    print 'average inner distance:', aveInnDis

    # External (inter-topic) distance: mean squared similarity between all
    # cross-topic barrage pairs; only the upper triangle of the matrix is
    # filled (topicId < ti2).
    cols = topicNum
    rows = topicNum
    extDisMatrix = [[0.0 for col in range(cols)] for row in range(rows)]

    # extDisMatrixNum[0]: the number of similarity value every topic
    # extDisMatrixNum = [[0.0 for col in range(cols)] for row in range(rows)]
    iRow = 0.0
    # countt = 0
    for topicId in range(topicNum):
        for ti2 in range(topicId + 1, topicNum):
            for i in range(len(simMatrixTopicList[topicId])):
                for j in range(len(simMatrixTopicList[ti2])):
                    iRow += (simMatrix[simMatrixTopicList[topicId][i]][
                        simMatrixTopicList[ti2][j]][1])**2
                    # countt += 1
            # print iRow
            iRowNum = len(simMatrixTopicList[topicId]) * len(
                simMatrixTopicList[ti2])
            # print 'iRowNum:', iRowNum, 'countt:', countt
            # iRow is a float so this is true division despite iRowNum being int.
            extDisMatrix[topicId][ti2] = iRow / iRowNum
            iRow = 0.0
            # countt = 0

    print 'external distance:', extDisMatrix

    totExtDis = 0
    aveExtDis = 0

    for i in extDisMatrix:
        for j in i:
            totExtDis += j
    # Number of filled (strictly upper-triangular) cells: C(cols, 2).
    extNoneZeroNum = (1 + cols - 1) * (cols - 1) / 2

    # Reciprocal of the mean squared inter-topic similarity (same convention
    # as aveInnDis above).
    aveExtDis = 1 / (totExtDis / extNoneZeroNum)

    print 'average external distance:', aveExtDis
    print 'inner/external value:', aveInnDis / aveExtDis

    # return aveInnDis, aveExtDis

    # sys.exit()

    # Average assignment probability per topic across the corpus.
    topicPosi = []
    for topicId in range(topicNum):
        posiList = [i[1] for i in resList if i[0] == topicId]

        # average accuracy rate
        # NOTE(review): raises ZeroDivisionError if a topic received no docs.
        possi = sum(posiList) / len(posiList)
        topicPosi.append(possi)

    # sys.exit()

    fullPath = os.getcwd()

    # concatenate full path
    # NOTE(review): vCid is not defined in this function -- presumably a
    # module-level global naming the video/content id; verify.
    userCodeIdListFilePath = fullPath + '/data/users/' + vCid + '/userIdList.txt'
    userCodeIdList = util.getUserCodeIdList(userCodeIdListFilePath)
    # for i in userCodeIdList:
    #     print i

    favTagTlist = util.getFilesOfDir(vCid)

    # concatenate full path
    favTagTlist = [
        fullPath + '/data/users/' + vCid + '/' + tagT for tagT in favTagTlist
    ]
    for i in favTagTlist:
        print i
    tagMatrix, tagVNumMatrix, userList, catAll = clustering.scanAllTags(
        favTagTlist)

    catNum = len(catAll)

    # eg: topicDist =
    # [[125.  126.   83.   18.  121.   44.   72.    0.  108.  113.   46.   66.  114.    0.  109.],
    # [ 799.  785.  558.  141.  737.  286.  425.    2.  659.  611.  376.  460.  765.    0.  657.],
    # [ 308.  321.  238.   48.  272.  116.  162.    0.  259.  236.  135.  173.  284.    1.  267.],
    # [ 557.  540.  378.   99.  490.  215.  315.    0.  457.  424.  232.  295.  514.    0.  449.],
    # [ 537.  535.  361.   86.  477.  176.  293.    0.  463.  416.  234.  297.  509.    0.  444.]]
    # 音乐 动画 番剧 广告 电影 时尚 舞蹈 公告 游戏 鬼畜 娱乐 电视剧 生活 活动 科技
    # (column labels above: Music, Animation, Bangumi, Ads, Movie, Fashion,
    # Dance, Announcement, Game, Kichiku, Entertainment, TV Drama, Life,
    # Event, Tech -- Bilibili category names, translation inferred.)
    topicDist = np.zeros((topicNum, catNum))
    # every percentage of topic
    topicPercDist = np.zeros((topicNum, catNum))

    topicDistNoneNum = np.zeros(topicNum)
    userIdNoneNum = 0

    # topic index list: [0, 1, 2, 3, 4]
    topicNumList = range(topicNum)

    # a list of: all users' barrage data of a topic
    aTopicNewUniBContentListString = []
    topicUserNumList = []

    for i in topicNumList:
        aTopicNewUniBContentListString.append([])
        topicUserNumList.append(0)

    # used to calculate the number of topicPercDist added
    topicPercDistAddNum = np.zeros(topicNum)
    # Accumulate, per topic, each contributing user's tag counts and tag
    # percentages; also bucket raw barrage records by topic.
    for i in range(len(resList)):
        # print i
        topicId = resList[i][0]
        if topicId in topicNumList:
            userId = util.getUserId(newUniBContentListString[i][0],
                                    userCodeIdList)
            # print newUniBContentListString[i][0], userId
            aTopicNewUniBContentListString[topicId].append(
                newUniBContentListString[i])
            topicUserNumList[topicId] += 1

            if userId is not None:
                # print userId, favTagTlist, type(favTagTlist)
                res = clustering.getTagLineOfUser(tagMatrix, tagVNumMatrix,
                                                  userList, userId)
                if res is not None:
                    tagLineOfUI, tagVNumLineOfUI = res
                # userId is not in the list
                else:
                    continue

                if tagLineOfUI is not None:
                    # print len(tagLineOfUI), userId, tagLineOfUI
                    topicDist[topicId] += tagLineOfUI

                    # the perc distribution of tagVideo number of a user
                    tagVPercLineOfUI = np.around(
                        tagVNumLineOfUI / float(sum(tagVNumLineOfUI)), 3)
                    topicPercDist[topicId] += tagVPercLineOfUI
                    topicPercDistAddNum[topicId] += 1
                else:
                    # NOTE(review): topicDistNoneNum is an ndarray, so `+= 1`
                    # broadcasts and increments EVERY element; a scalar counter
                    # (or topicDistNoneNum[topicId] += 1) was probably intended.
                    topicDistNoneNum += 1
            else:
                userIdNoneNum += 1

    topicWordBListL = []
    # topicWordBListLSec = []
    topicWordBListL2 = []
    topicWordBListL3 = []

    # print Top 10 frequent words in a topic & the barrageList of the topic, in one time
    for i in topicNumList:
        bContentList2 = xmlDL.divideSent(aTopicNewUniBContentListString[i], 0)
        wordList2 = getMostWord(bContentList2, 20)
        print '------------topic', i, ':',
        for j in wordList2:
            print j[0],
        print

    top10weightActor = []
    top10weightNoActor = []

    # print Top 10 frequent words in a topic & the barrageList of the topic
    for i in topicNumList:
        print '------------topic', i, '-------------users:', topicUserNumList
        # divideSent(,0) no actors' name
        # NOTE(review): this shadows the bContentList parameter from here on.
        bContentList = xmlDL.divideSent(aTopicNewUniBContentListString[i], 0)
        wordList = getMostWord(bContentList, 20)
        for j in wordList:
            print j[0], j[1]

        for j in aTopicNewUniBContentListString[i]:
            # eg: abb5230a 417.671 灵尊:哟,火柴棍
            # (record layout appears to be: user code, timestamp, text)

            if wordList[0][0].encode('utf-8') in j[2]:
                util.printList(j)
        print 'wordList[0][0]', wordList[0][0]
        # the index list of all barrage in topic i which contains the 1st frequent word in topic i
        topicIWord1BList = [
            j[1] for j in aTopicNewUniBContentListString[i]
            if wordList[0][0].encode('utf-8') in j[2]
        ]
        topicWordBListL.append(topicIWord1BList)

        # second word
        # topicIWord1BListSec = [j[1] for j in aTopicNewUniBContentListString[i] if wordList[1][0].encode('utf-8') in j[2]]
        # topicWordBListLSec.append(topicIWord1BListSec)

        # rmNum strips the weights from an LDA topic string, e.g.
        # 0.002*"合影" + 0.002*"钱" + 0.002*"撒花" + 0.002*"没" + 0.002*"完结" + 0.002*"看" + 0.002*"啊" + 0.002*"之" + 0.002*"湫" + 0.002*"一个"
        # becomes "合影" + "钱" + "撒花" + "没" + .....
        wordList2 = rmNum(ldaOut[i][1].encode('utf-8'))

        # get the most weight word in wordList2(after deleting actors' name)
        # eg: wordList2 = '''0.440*"小凡" + 0.030*"鲸鱼" + 0.018*"上线" + 0.014*"灰" + 0.013*"套路" + 0.012*"官方" + 0.010*"小痴" + 0.009*"滴血" + 0.009*"姐姐" + 0.009*"嘴"'''
        # firstWord = '上线', (default firstWord is '小凡')
        firstWord = wordList2[0]

        # Pick the highest-weight word that is NOT in the actor-name stopword
        # list.  NOTE(review): the stopword file is re-read on every iteration
        # of every topic -- could be hoisted out of the loops.
        for word in wordList2:
            # if '小葵' in wordList2:
            #     firstWord = '小葵'
            #     top10weightNoActor.append('小葵')
            #     break
            if word in util.getTxtList(
                    'data/stopWord/jiebaNewWord_Qingyunzhi.txt'):
                continue
            else:
                firstWord = word
                top10weightNoActor.append(word)
                break
        top10weightActor.append(wordList2[0])
        # the index list of all barrage in topic i which contains the 1st frequent word(with weight) in topic i
        topicIWord1BList2 = [
            j[1] for j in aTopicNewUniBContentListString[i]
            if firstWord in j[2]
        ]
        topicWordBListL2.append(topicIWord1BList2)

        # weight, with actors' name
        topicIWord1BList3 = [
            j[1] for j in aTopicNewUniBContentListString[i]
            if wordList2[0] in j[2]
        ]
        topicWordBListL3.append(topicIWord1BList3)

    print 'top 10 weight word with actor:',
    for i in top10weightActor:
        print i,
    print

    print 'top 10 weight word without actor:',
    for i in top10weightNoActor:
        print i,
    print

    # Scatter the occurrence times of each topic's signature word against the
    # topic id (top: frequency-based word, bottom: weight-based word).
    plt.figure(1)
    plt.subplot(211)
    # NOTE(review): this discards the frequency-based lists and plots the
    # weight-with-actor lists in the top panel instead -- confirm intended.
    topicWordBListL = topicWordBListL3
    for i in topicWordBListL:
        # y is the topic id repeated once per timestamp (indexx is unused).
        y = [topicWordBListL.index(i) for indexx in range(len(i))]
        plt.scatter(i, y, marker='.', color='b')
        # print 'len(i), len(y):', len(i), len(y), i, y
    plt.plot([], marker='.', color='b', label='Most frequent words')
    plt.xlim(0, )
    plt.legend()
    plt.xlabel('Barrage Time(s)')
    plt.ylabel('Topic ID')

    plt.subplot(212)
    for i in topicWordBListL2:
        y = [topicWordBListL2.index(i) for indexx in range(len(i))]
        plt.scatter(i, y, marker='x', color='r')
    plt.plot([], marker='x', color='r', label='Most weight words')
    plt.xlim(0, )
    plt.legend()
    plt.xlabel('Barrage Time(s)')
    plt.ylabel('Topic ID')

    # plt.show()

    print 'the num of users of different topics:', topicUserNumList
    print 'the num of users who is not in userCodeIdList:', userIdNoneNum
    print '-------------------------'

    # print topicDist

    # Row sums, then row-normalise topicDist into per-topic tag fractions.
    topicDist2 = np.sum(topicDist, axis=1)
    # the percentage of tag of a user
    # NOTE(review): np.float16 limits precision to ~3 significant digits and
    # divides by zero for topics with an all-zero row -- confirm acceptable.
    topicDist3 = np.transpose(np.transpose(topicDist) / np.float16(topicDist2))
    print topicDist3, '\n'

    # Turn the accumulated percentage sums into per-topic means.
    for i in range(len(topicPercDist)):
        for j in range(len(topicPercDist[i])):
            topicPercDist[i][j] = topicPercDist[i][j] / topicPercDistAddNum[i]
    # topicPercDist = topicPercDist/topicPercDistAddNum
    print 'topicPercDist, topicPercDist[1][1]:', topicPercDist, topicPercDist[
        1][1]
    np.savetxt('topicDist.txt', topicDist)
    np.savetxt('topicPercDist.txt', topicPercDist)

    colorList = ['b', 'c', 'g', 'k', 'm', 'r', 'y']
    plt.figure(5)
    for i in range(len(topicPercDist)):
        plt.plot(topicPercDist[i], colorList[i])
    # NOTE(review): these two lines REBIND plt.xlabel/plt.ylabel (functions)
    # to strings instead of calling them, so no axis labels are drawn and any
    # later plt.xlabel(...) call would raise TypeError.  The intended labels
    # translate to "topic categories of user-favourited videos" and
    # "share of each topic among favourited videos".
    plt.xlabel = u'用户收藏视频主题类型'
    plt.ylabel = u'用户收藏视频主题占比(各主题视频数量/所有主题视频数量)'

    plt.show()

    print topicNum, "topics possibility average value:", topicPosi, sum(
        topicPosi) / len(topicPosi)

    # Count how many barrages landed in each observed topic id.
    resIndexList = [i[0] for i in resList]
    resTypeList = list(set(resIndexList))
    # number of type index
    resTCountList = []
    # barrage sequence number of type index
    indexList = []

    for index in resTypeList:
        resTCountList.append(resIndexList.count(index))
        indexList.append([i for i, v in enumerate(resIndexList) if v == index])

    # print resTCountList
    # print indexList[0]

    # type 0
    print 'all barrage comments of type 000000000000000------------'
    # for i in indexList[0]:
    #     print uniBContentList[i][-1]

    # NOTE(review): plt.xlabel/plt.ylabel were rebound to strings above, so
    # these calls would raise TypeError if reached -- verify execution order.
    plt.xlabel("Barrage Type")
    plt.ylabel("Barrage Number")
    plt.plot(resTCountList, 'r-')

    # plt.show()

    # return 1/aveInnDis, 1/aveExtDis
    return aveInnDis, aveExtDis