예제 #1
0
파일: lda.py 프로젝트: honqging/GradDesign
def preLda(vCid):
    bList = xmlDL.migrateBD(vCid)

    # eg: uniBContentList: ['user', ['b1', 'b2', 'b3']]
    # eg: uniBContentList2: ['user1', '157.430', 'b1'], ['user1', '159.430', 'b2']....
    # eg: uniBContentListString: ['22ccd704', '\xe7\x88\xb7\xe7\x88\xb7QAQ']
    uniBContentList, uniBContentList2, uniBContentListString, bNumPerUser = getUserAndBarrageList(
        bList)

    # list of a user's total barrage
    # newUniBContentListString = removeBLessNum(uniBContentListString, bNumPerUser, 3)

    # list of a user's one barrage
    newUniBContentListString = removeBLessNum(uniBContentList2, bNumPerUser, 3)

    bContentList = xmlDL.divideSent(newUniBContentListString, 1)

    # remove barrage comments which have no N or A
    print len(bContentList), len(newUniBContentListString)
    for bContentI in range(len(bContentList)):
        try:
            if len(bContentList[bContentI]) == 0:
                # bContentList.remove(bContentList[bContentI])
                # newUniBContentListString.remove(newUniBContentListString[bContentI])
                del bContentList[bContentI]
                del newUniBContentListString[bContentI]
        except:
            continue
    print len(bContentList), len(newUniBContentListString)

    print '---newUniBContentListString', newUniBContentListString[3][
        0], newUniBContentListString[10][1], newUniBContentListString[10][2]
    # print '---bContentList', bContentList[3], bContentList[10]

    # sys.exit()

    # 276088 words tatally
    # 17941 unique words
    # dictionary of all words
    dic = corpora.Dictionary(bContentList)
    print type(dic)
    # for word, index in dic.token2id.iteritems():
    #     word = word.encode('utf-8')
    #     print word, index
    print 'dictionary number of docs, num_pos, number of terms: ', dic.num_docs, dic.num_pos, len(
        dic)

    # text corpus
    corpus = [dic.doc2bow(text) for text in bContentList]
    # print max(corpus)

    tfidf = models.TfidfModel(corpus)
    print tfidf, type(tfidf)
    example = [(0, 1), (2, 1)]
    print tfidf[example]
    for word, index in dic.token2id.iteritems():
        if index == 4 or index == 0:
            word = word.encode('utf-8')
            print word, index
    return dic, corpus, tfidf, bContentList, newUniBContentListString
예제 #2
0
def getScoreListFromVCid3(vCid):
    """Compute a sentiment score tuple for every barrage of a video.

    Each barrage text (field 8 of a record) is segmented with
    `jieba.lcut(..., cut_all=False)` and scored with
    `sentimentAnalysis.getScore`.

    Args:
        vCid: video/comment id forwarded unchanged to `xmlDL.migrateBD`.

    Returns:
        A list with one tuple per barrage, laid out as
        `(float(b[0]), b[6]) + <getScore result> + (b[8], 0)`.
        NOTE(review): b[0] is presumably the timestamp and b[6] the user id
        -- confirm against xmlDL.migrateBD.
    """
    # TODO: restricting the result to users with at least 3 barrages
    # (getUserList(3)) was apparently intended here but never applied; the
    # lookup is not performed because its result was unused.
    scoreList3 = []
    for b in xmlDL.migrateBD(vCid):
        words = jieba.lcut(b[8], cut_all=False)
        score = sentimentAnalysis.getScore(words)
        # getScore's result is concatenated as a tuple; the trailing 0 is a
        # constant placeholder field.
        scoreList3.append((float(b[0]), b[6]) + score + (b[8], 0))
    return scoreList3
예제 #3
0
def getUserList(num):
    """Return the first field of every per-user entry whose count is >= num.

    The per-user entries are the fourth value returned by
    `lda.getUserAndBarrageList` for the barrages of `xmlDL.vCid`.
    NOTE(review): each entry is presumably a (userId, barrageCount) pair --
    confirm against lda.getUserAndBarrageList.

    Args:
        num: minimum value of an entry's second field for it to be kept.

    Returns:
        A list of the first fields of the qualifying entries, in their
        original order.
    """
    allBarrages = xmlDL.migrateBD(xmlDL.vCid)
    # Only the per-user counts are needed; the other three results are unused.
    _, _, _, perUserCounts = lda.getUserAndBarrageList(allBarrages)
    return [entry[0] for entry in perUserCounts if entry[1] >= num]
예제 #4
0
파일: util.py 프로젝트: honqging/GradDesign
def writeUIdList(vCid, filePath):
    userList = xmlDL.migrateBD(vCid)[:, 6]
    uniUsers = np.unique(userList)
    print '---------totally', len(uniUsers), 'unique users in', vCid
    # print hashTrans(uniUsers[12])

    if not os.path.exists(filePath):
        os.mkdir(filePath)

    fo = open(filePath + 'userIdList.txt', "w+")
    for i in range(len(uniUsers)):
        transUserId = hashTrans(uniUsers[i])
        print transUserId
        fo.write(uniUsers[i] + ',' + str(transUserId))
        fo.write('\n')
    print '-----------all userIdList of', vCid, 'is downloaded to local'
예제 #5
0
    # userList whose barrage is larger than 3
    bList = newUniBContentListString
    for b in bList:
        word_list = jieba.lcut(b[1], cut_all=False)
        a = sentimentAnalysis.getScore(word_list)
        a = (b[0], b[1]) + a
        scoreList2.append(a)
    return scoreList2


# Module-level result lists; they are initialised empty here and are not
# written to anywhere in this part of the file.
scoreList = []
posScoreList = []
negScoreList = []

if __name__ == '__main__':
    # Script entry point: load the barrages of the configured video
    # (xmlDL.vCid), filter them, and score what remains.
    bList = xmlDL.migrateBD(xmlDL.vCid)

    # eg: uniBContentListString: ['22ccd704', '\xe7\x88\xb7\xe7\x88\xb7QAQ']
    uniBContentList, uniBContentList2, uniBContentListString, bNumPerUser = lda.getUserAndBarrageList(
        bList)
    print 'uni', len(uniBContentListString)
    # NOTE(review): the threshold 3 presumably drops users with fewer than 3
    # barrages -- confirm against lda.removeBLessNum.
    newUniBContentListString = lda.removeBLessNum(uniBContentListString,
                                                  bNumPerUser, 3)
    print 'new', len(newUniBContentListString)

    # Score every remaining entry; the result is bound to the module-level
    # name scoreList2.
    scoreList2 = getScoreListFromVCid(newUniBContentListString)
    # Debug dump of every score tuple (disabled):
    # for i in scoreList2:
    #     for j in i:
    #         print j,
    #     print