def _createConferenceObject(self, request):
    """Store a new Conference built from the inbound form and return the form.

    The signed-in user becomes the organizer.  Every field of ``request`` is
    copied into a dict, values missing from the form are filled from
    ``DEFAULTS`` (and written back onto ``request``), date strings are
    parsed, and the entity is saved under a key whose parent is the user's
    Profile key.

    Args:
        request: inbound message; its ``name`` must be non-empty.

    Returns:
        The same ``request`` object, updated with the applied defaults,
        ``seatsAvailable`` (when ``maxAttendees`` is positive) and
        ``organizerUserId``.

    Raises:
        endpoints.UnauthorizedException: nobody is signed in.
        endpoints.BadRequestException: ``request.name`` is empty.
    """
    current_user = endpoints.get_current_user()
    if not current_user:
        raise endpoints.UnauthorizedException('Authorization required')
    user_id = util.getUserId(current_user)

    if not request.name:
        raise endpoints.BadRequestException("Conference 'name' field required")

    # Mirror the message into a plain dict, then drop the two fields that are
    # deleted before the dict is handed to the Conference constructor.
    data = {}
    for field in request.all_fields():
        data[field.name] = getattr(request, field.name)
    for form_only in ('websafeKey', 'organizerDisplayName'):
        del data[form_only]

    # Fill gaps from DEFAULTS, both in the stored data and the returned form.
    for name in DEFAULTS:
        if data[name] in (None, []):
            fallback = DEFAULTS[name]
            data[name] = fallback
            setattr(request, name, fallback)

    # Only the first ten characters (YYYY-MM-DD) of a date string are parsed.
    start_text = data['startDate']
    if start_text:
        start_date = datetime.strptime(start_text[:10], "%Y-%m-%d").date()
        data['startDate'] = start_date
        data['month'] = start_date.month
    else:
        data['month'] = 0

    end_text = data['endDate']
    if end_text:
        data['endDate'] = datetime.strptime(end_text[:10], "%Y-%m-%d").date()

    # A new conference starts with every seat free.
    capacity = data["maxAttendees"]
    if capacity > 0:
        data["seatsAvailable"] = capacity
        request.seatsAvailable = capacity

    # Allocate an id under the organizer's Profile key and build the full key.
    profile_key = ndb.Key(Profile, user_id)
    conference_id = Conference.allocate_ids(size=1, parent=profile_key)[0]
    data['key'] = ndb.Key(Conference, conference_id, parent=profile_key)

    data['organizerUserId'] = user_id
    request.organizerUserId = user_id

    Conference(**data).put()
    return request
def getConferencesCreated(self, request):
    """Return the conferences stored under the signed-in user's Profile key.

    Args:
        request: inbound message; not read by this method.

    Returns:
        ConferenceForms whose ``items`` hold one form per conference found by
        the ancestor query, each built by ``self._copyConferenceToForm``.

    Raises:
        endpoints.UnauthorizedException: nobody is signed in.
    """
    user = endpoints.get_current_user()
    if not user:
        raise endpoints.UnauthorizedException('Authorization required')

    # Conferences are looked up by ancestor: the user's Profile key.
    p_key = ndb.Key(Profile, util.getUserId(user))
    conferences = Conference.query(ancestor=p_key)

    # The Profile entity may never have been stored, in which case get()
    # returns None.  Fall back to an empty display name instead of failing
    # with AttributeError.
    prof = p_key.get()
    displayName = getattr(prof, 'displayName', '')

    return ConferenceForms(
        items=[self._copyConferenceToForm(conf, displayName)
               for conf in conferences]
    )
def _getProfileFromUser(self):
    """Fetch the signed-in user's Profile, storing a fresh one if none exists.

    Returns:
        The Profile entity keyed by the user's id.  A newly created profile
        takes its display name and e-mail from the user object and gets
        ``TeeShirtSize.NOT_SPECIFIED`` (as a string) for its shirt size.

    Raises:
        endpoints.UnauthorizedException: nobody is signed in.
    """
    user = endpoints.get_current_user()
    if not user:
        raise endpoints.UnauthorizedException('Authorization required')

    p_key = ndb.Key(Profile, util.getUserId(user))

    existing = p_key.get()
    if existing:
        return existing

    # First visit: persist a profile seeded from the account data.
    created = Profile(
        key=p_key,
        displayName=user.nickname(),
        mainEmail=user.email(),
        teeShirtSize=str(TeeShirtSize.NOT_SPECIFIED),
    )
    created.put()
    return created
def ldaa(dic, corpus, tfidf, bContentList, newUniBContentListString, topicNum): corpus_tfidf = tfidf[corpus] print '------------------type(corpus_tfidf):', type(corpus_tfidf) # for i in corpus_tfidf: # print i lda = models.LdaModel(corpus_tfidf, id2word=dic, num_topics=topicNum) ldaOut = lda.print_topics(topicNum) li = 5 vec = [(0, 1), (4, 1)] vec = dic.doc2bow(bContentList[li]) # get similarity matrix of len(bContentList) * len(bContentList) index = similarities.MatrixSimilarity(lda[corpus]) simMatrix = [] # get the Similarity Matrix(eg: 100 * 100) of all barrages, for bIndex in range(len(bContentList)): vec = bContentList[bIndex] vec_bow = dic.doc2bow(bContentList[bIndex]) vec_lda = lda[vec_bow] sims = index[vec_lda] # print list(enumerate(sims)) # sorted with similarity from high to low # sims = sorted(enumerate(sims), key=lambda item: -item[1]) # print sims, len(sims), type(sims) simMatrix.append(list(enumerate(sims))) # eg: simMatrix[1] = [(0, 0.91061151), (1, 0.99999994), (2, 0.99999994), (3, 0.99999994), (4, 0.73748994), (5, 0.81874228)......] 
# print len(simMatrix), simMatrix[1] # sys.exit() # print all lda topics words # such as: # 0.002*"合影" + 0.002*"钱" + 0.002*"撒花" + 0.002*"没" + 0.002*"完结" + 0.002*"看" + 0.002*"啊" + 0.002*"之" + 0.002*"湫" + 0.002*"一个" # 0.002*"买" + 0.002*"第一次" + 0.002*"支持" + 0.002*"啊" + 0.002*"没" + 0.002*"完结" + 0.002*"湫" + 0.002*"国漫" + 0.002*"撒花" + 0.002*"b" # 0.004*"第一次" + 0.003*"湫" + 0.003*"合影" + 0.003*"在" + 0.003*"存活" + 0.003*"买" + 0.003*"确认" + 0.003*"啊" + 0.003*"椿" + 0.002*"撒花" # 0.003*"完结" + 0.003*"撒花" + 0.003*"钱" + 0.003*"合影" + 0.002*"再见" + 0.002*"没" + 0.002*"啊" + 0.002*"湫" + 0.002*"好" + 0.001*"第一次" # 0.003*"存活" + 0.003*"确认" + 0.002*"合影" + 0.002*"没" + 0.002*"钱" + 0.002*"秋水共长天一色" + 0.002*"第一次" + 0.001*"靠" + 0.001*"也" + 0.001*"生日" for i in ldaOut: r = i[1].encode('utf-8') print r for i in ldaOut: r = i[1].encode('utf-8') print 'Topic', ldaOut.index(i), ':', util.printList(rmNum(r)) # sys.exit() print type(ldaOut[0]) print type(ldaOut[0][0]) corpus_lda = lda[corpus_tfidf] resList = [] iii = 0 # eg: doc [(0, 0.033333333333334041), (1, 0.033333333333659149), (2, 0.03333333333337106), (3, 0.033333333333336511), (4, 0.033333333333333631), (5, 0.033333333577374141), (6, 0.033333333333333381), (7, 0.53333330176939997), (8, 0.033333333641347308), (9, 0.033333333333333388), (10, 0.033333333333333409), (11, 0.033333358397907714), (12, 0.033333333333333381), (13, 0.033333333333333368), (14, 0.033333339280269603)] for doc in corpus_lda: # eg: res = (3, 0.72867093662442284), res has 72% posibility to be in type 3 res = getMax(doc) resList.append(res) print '---type(corpus_tfidf), type(corpus_lda)', type(corpus_tfidf), type( corpus_lda) print '---len(resList)', len(resList) # len = topicNum simMatrixTopicList = [] for topicId in range(topicNum): simMatrixTopic = [ i for i in range(len(resList)) if resList[i][0] == topicId ] print topicId, 'topic has:', len(simMatrixTopic), 'barrage' simMatrixTopicList.append(simMatrixTopic) # print len(simMatrixTopic), simMatrixTopic # without square # # inner 
distance # # sum of all similarity of i'th row # iRow = 0.0 # num = 0 # innDisMatrix = [0.0 for i in range(topicNum)] # for topicId in range(topicNum): # for i in range(len(simMatrixTopicList[topicId])-1): # for j in range(i+1, len(simMatrixTopicList[topicId])): # # print simMatrix[simMatrixTopicList[topicId][i]][simMatrixTopicList[topicId][j]][1] # iRow += simMatrix[simMatrixTopicList[topicId][i]][simMatrixTopicList[topicId][j]][1] # # print topicId, 'topic, num:', num # lenOfIRow = len(simMatrixTopicList[topicId]) # numOfIRow = (1 + lenOfIRow - 1) * (lenOfIRow - 1) / 2 # innDisMatrix[topicId] = iRow/numOfIRow # iRow = 0.0 # print 'inner distance:', innDisMatrix # # aveInnDis = sum(innDisMatrix) / len(innDisMatrix) # print 'average inner distance:', aveInnDis # # # external distance # cols = topicNum # rows = topicNum # extDisMatrix = [[0.0 for col in range(cols)] for row in range(rows)] # iRow = 0.0 # for topicId in range(topicNum): # for ti2 in range(topicId+1, topicNum): # for i in range(len(simMatrixTopicList[topicId])): # for j in range(len(simMatrixTopicList[ti2])): # iRow += simMatrix[simMatrixTopicList[topicId][i]][simMatrixTopicList[ti2][j]][1] # # iRow += iRow # # print iRow # lenOfIRow = len(simMatrixTopicList[topicId]) * len(simMatrixTopicList[ti2]) # extDisMatrix[topicId][ti2] = iRow / float(lenOfIRow) # iRow = 0.0 # # print 'external distance:', extDisMatrix # # totExtDis = 0 # aveExtDis = 0 # num = 0 # for i in extDisMatrix: # for j in i: # if j != 0: # totExtDis += j # num += 1 # aveExtDis = totExtDis / float(num) # # print 'average external distance:', aveExtDis # print 'inner/external value:', aveInnDis/aveExtDis # within square(**2) # inner distance # sum of all similarity of i'th row iRow = 0.0 num = 0 innDisMatrix = [0.0 for i in range(topicNum)] # innDisMatrixNum[0]: the number of similarity value every topic innDisMatrixNum = [0.0 for i in range(topicNum)] for topicId in range(topicNum): for i in range(len(simMatrixTopicList[topicId]) - 1): 
for j in range(i + 1, len(simMatrixTopicList[topicId])): iRow += (simMatrix[simMatrixTopicList[topicId][i]][ simMatrixTopicList[topicId][j]][1])**2 # print topicId, 'topic, num:', num lenOfIRow = len(simMatrixTopicList[topicId]) numOfIRow = (1 + lenOfIRow - 1) * (lenOfIRow - 1) / 2 innDisMatrix[topicId] = iRow / numOfIRow # innDisMatrixNum[topicId] = numOfIRow iRow = 0.0 print 'inner distance:', innDisMatrix aveInnDis = 1 / (sum(innDisMatrix) / topicNum) print 'average inner distance:', aveInnDis # external distance cols = topicNum rows = topicNum extDisMatrix = [[0.0 for col in range(cols)] for row in range(rows)] # extDisMatrixNum[0]: the number of similarity value every topic # extDisMatrixNum = [[0.0 for col in range(cols)] for row in range(rows)] iRow = 0.0 # countt = 0 for topicId in range(topicNum): for ti2 in range(topicId + 1, topicNum): for i in range(len(simMatrixTopicList[topicId])): for j in range(len(simMatrixTopicList[ti2])): iRow += (simMatrix[simMatrixTopicList[topicId][i]][ simMatrixTopicList[ti2][j]][1])**2 # countt += 1 # print iRow iRowNum = len(simMatrixTopicList[topicId]) * len( simMatrixTopicList[ti2]) # print 'iRowNum:', iRowNum, 'countt:', countt extDisMatrix[topicId][ti2] = iRow / iRowNum iRow = 0.0 # countt = 0 print 'external distance:', extDisMatrix totExtDis = 0 aveExtDis = 0 for i in extDisMatrix: for j in i: totExtDis += j extNoneZeroNum = (1 + cols - 1) * (cols - 1) / 2 aveExtDis = 1 / (totExtDis / extNoneZeroNum) print 'average external distance:', aveExtDis print 'inner/external value:', aveInnDis / aveExtDis # return aveInnDis, aveExtDis # sys.exit() # topic possibility distribution in user profile topicPosi = [] for topicId in range(topicNum): posiList = [i[1] for i in resList if i[0] == topicId] # average accuracy rate possi = sum(posiList) / len(posiList) topicPosi.append(possi) # sys.exit() fullPath = os.getcwd() # concatenate full path userCodeIdListFilePath = fullPath + '/data/users/' + vCid + '/userIdList.txt' 
userCodeIdList = util.getUserCodeIdList(userCodeIdListFilePath) # for i in userCodeIdList: # print i favTagTlist = util.getFilesOfDir(vCid) # concatenate full path favTagTlist = [ fullPath + '/data/users/' + vCid + '/' + tagT for tagT in favTagTlist ] for i in favTagTlist: print i tagMatrix, tagVNumMatrix, userList, catAll = clustering.scanAllTags( favTagTlist) catNum = len(catAll) # eg: topicDist = # [[125. 126. 83. 18. 121. 44. 72. 0. 108. 113. 46. 66. 114. 0. 109.], # [ 799. 785. 558. 141. 737. 286. 425. 2. 659. 611. 376. 460. 765. 0. 657.], # [ 308. 321. 238. 48. 272. 116. 162. 0. 259. 236. 135. 173. 284. 1. 267.], # [ 557. 540. 378. 99. 490. 215. 315. 0. 457. 424. 232. 295. 514. 0. 449.], # [ 537. 535. 361. 86. 477. 176. 293. 0. 463. 416. 234. 297. 509. 0. 444.]] # 音乐 动画 番剧 广告 电影 时尚 舞蹈 公告 游戏 鬼畜 娱乐 电视剧 生活 活动 科技 topicDist = np.zeros((topicNum, catNum)) # every percentage of topic topicPercDist = np.zeros((topicNum, catNum)) topicDistNoneNum = np.zeros(topicNum) userIdNoneNum = 0 # topic index list: [0, 1, 2, 3, 4] topicNumList = range(topicNum) # a list of: all users' barrage data of a topic aTopicNewUniBContentListString = [] topicUserNumList = [] for i in topicNumList: aTopicNewUniBContentListString.append([]) topicUserNumList.append(0) # used to calculate the number of topicPercDist added topicPercDistAddNum = np.zeros(topicNum) for i in range(len(resList)): # print i topicId = resList[i][0] if topicId in topicNumList: userId = util.getUserId(newUniBContentListString[i][0], userCodeIdList) # print newUniBContentListString[i][0], userId aTopicNewUniBContentListString[topicId].append( newUniBContentListString[i]) topicUserNumList[topicId] += 1 if userId is not None: # print userId, favTagTlist, type(favTagTlist) res = clustering.getTagLineOfUser(tagMatrix, tagVNumMatrix, userList, userId) if res is not None: tagLineOfUI, tagVNumLineOfUI = res # userId is not in the list else: continue if tagLineOfUI is not None: # print len(tagLineOfUI), userId, tagLineOfUI 
topicDist[topicId] += tagLineOfUI # the perc distribution of tagVideo number of a user tagVPercLineOfUI = np.around( tagVNumLineOfUI / float(sum(tagVNumLineOfUI)), 3) topicPercDist[topicId] += tagVPercLineOfUI topicPercDistAddNum[topicId] += 1 else: topicDistNoneNum += 1 else: userIdNoneNum += 1 topicWordBListL = [] # topicWordBListLSec = [] topicWordBListL2 = [] topicWordBListL3 = [] # print Top 10 frequent words in a topic & the barrageList of the topic, in one time for i in topicNumList: bContentList2 = xmlDL.divideSent(aTopicNewUniBContentListString[i], 0) wordList2 = getMostWord(bContentList2, 20) print '------------topic', i, ':', for j in wordList2: print j[0], print top10weightActor = [] top10weightNoActor = [] # print Top 10 frequent words in a topic & the barrageList of the topic for i in topicNumList: print '------------topic', i, '-------------users:', topicUserNumList # divideSent(,0) no actors' name bContentList = xmlDL.divideSent(aTopicNewUniBContentListString[i], 0) wordList = getMostWord(bContentList, 20) for j in wordList: print j[0], j[1] for j in aTopicNewUniBContentListString[i]: # eg: abb5230a 417.671 灵尊:哟,火柴棍 if wordList[0][0].encode('utf-8') in j[2]: util.printList(j) print 'wordList[0][0]', wordList[0][0] # the index list of all barrage in topic i which contains the 1st frequent word in topic i topicIWord1BList = [ j[1] for j in aTopicNewUniBContentListString[i] if wordList[0][0].encode('utf-8') in j[2] ] topicWordBListL.append(topicIWord1BList) # second word # topicIWord1BListSec = [j[1] for j in aTopicNewUniBContentListString[i] if wordList[1][0].encode('utf-8') in j[2]] # topicWordBListLSec.append(topicIWord1BListSec) # 0.002*"合影" + 0.002*"钱" + 0.002*"撒花" + 0.002*"没" + 0.002*"完结" + 0.002*"看" + 0.002*"啊" + 0.002*"之" + 0.002*"湫" + 0.002*"一个" # "合影" + "钱" + "撒花" + "没" + ..... 
wordList2 = rmNum(ldaOut[i][1].encode('utf-8')) # get the most weight word in wordList2(after deleting actors' name) # eg: wordList2 = '''0.440*"小凡" + 0.030*"鲸鱼" + 0.018*"上线" + 0.014*"灰" + 0.013*"套路" + 0.012*"官方" + 0.010*"小痴" + 0.009*"滴血" + 0.009*"姐姐" + 0.009*"嘴"''' # firstWord = '上线', (default firstWord is '小凡') firstWord = wordList2[0] for word in wordList2: # if '小葵' in wordList2: # firstWord = '小葵' # top10weightNoActor.append('小葵') # break if word in util.getTxtList( 'data/stopWord/jiebaNewWord_Qingyunzhi.txt'): continue else: firstWord = word top10weightNoActor.append(word) break top10weightActor.append(wordList2[0]) # the index list of all barrage in topic i which contains the 1st frequent word(with weight) in topic i topicIWord1BList2 = [ j[1] for j in aTopicNewUniBContentListString[i] if firstWord in j[2] ] topicWordBListL2.append(topicIWord1BList2) # weight, with actors' name topicIWord1BList3 = [ j[1] for j in aTopicNewUniBContentListString[i] if wordList2[0] in j[2] ] topicWordBListL3.append(topicIWord1BList3) print 'top 10 weight word with actor:', for i in top10weightActor: print i, print print 'top 10 weight word without actor:', for i in top10weightNoActor: print i, print plt.figure(1) plt.subplot(211) topicWordBListL = topicWordBListL3 for i in topicWordBListL: y = [topicWordBListL.index(i) for indexx in range(len(i))] plt.scatter(i, y, marker='.', color='b') # print 'len(i), len(y):', len(i), len(y), i, y plt.plot([], marker='.', color='b', label='Most frequent words') plt.xlim(0, ) plt.legend() plt.xlabel('Barrage Time(s)') plt.ylabel('Topic ID') plt.subplot(212) for i in topicWordBListL2: y = [topicWordBListL2.index(i) for indexx in range(len(i))] plt.scatter(i, y, marker='x', color='r') plt.plot([], marker='x', color='r', label='Most weight words') plt.xlim(0, ) plt.legend() plt.xlabel('Barrage Time(s)') plt.ylabel('Topic ID') # plt.show() print 'the num of users of different topics:', topicUserNumList print 'the num of users who is not in 
userCodeIdList:', userIdNoneNum print '-------------------------' # print topicDist topicDist2 = np.sum(topicDist, axis=1) # the percentage of tag of a user topicDist3 = np.transpose(np.transpose(topicDist) / np.float16(topicDist2)) print topicDist3, '\n' for i in range(len(topicPercDist)): for j in range(len(topicPercDist[i])): topicPercDist[i][j] = topicPercDist[i][j] / topicPercDistAddNum[i] # topicPercDist = topicPercDist/topicPercDistAddNum print 'topicPercDist, topicPercDist[1][1]:', topicPercDist, topicPercDist[ 1][1] np.savetxt('topicDist.txt', topicDist) np.savetxt('topicPercDist.txt', topicPercDist) colorList = ['b', 'c', 'g', 'k', 'm', 'r', 'y'] plt.figure(5) for i in range(len(topicPercDist)): plt.plot(topicPercDist[i], colorList[i]) plt.xlabel = u'用户收藏视频主题类型' plt.ylabel = u'用户收藏视频主题占比(各主题视频数量/所有主题视频数量)' plt.show() print topicNum, "topics possibility average value:", topicPosi, sum( topicPosi) / len(topicPosi) resIndexList = [i[0] for i in resList] resTypeList = list(set(resIndexList)) # number of type index resTCountList = [] # barrage sequence number of type index indexList = [] for index in resTypeList: resTCountList.append(resIndexList.count(index)) indexList.append([i for i, v in enumerate(resIndexList) if v == index]) # print resTCountList # print indexList[0] # type 0 print 'all barrage comments of type 000000000000000------------' # for i in indexList[0]: # print uniBContentList[i][-1] plt.xlabel("Barrage Type") plt.ylabel("Barrage Number") plt.plot(resTCountList, 'r-') # plt.show() # return 1/aveInnDis, 1/aveExtDis return aveInnDis, aveExtDis