def getWholePost(db, postId):
    """Return (title, wholePost) for the question identified by postId.

    wholePost concatenates the question title, body, all answer bodies and
    the tag string, separated by blank lines.  Returns (None, None) when no
    question matches postId.
    """
    title = None
    wholePost = None
    for question in util.iterateQuestions(db, postList=[postId]):
        # util.iterateAnswers takes a collection of post ids at its other
        # call sites in this file, so wrap the bare id in a list here too.
        answers = "\n\n".join(
            answer.body for answer in util.iterateAnswers(db, [postId]))
        title = question.title
        wholePost = "\n\n".join(
            [question.title, question.body, answers, question.tags])
    return (title, wholePost)
def getWholePost(db, postId):
    """Return (title, wholePost) for the question identified by postId.

    wholePost is the question title, body, every answer body and the tag
    string joined by blank lines; (None, None) when postId matches nothing.
    """
    title = None
    wholePost = None
    for question in util.iterateQuestions(db, postList=[postId]):
        # Other callers of util.iterateAnswers pass a list/iterable of ids
        # (see the __iter__ and scoreUsers call sites) -- do the same here.
        answerBodies = [answer.body
                        for answer in util.iterateAnswers(db, [postId])]
        answers = "\n\n".join(answerBodies)
        title = question.title
        wholePost = "\n\n".join(
            [question.title, question.body, answers, question.tags])
    return (title, wholePost)
def __iter__(self):
    """Yield one bag-of-words vector per question (title+body+answers+tags).

    Each item is self.dictionary.doc2bow(...) with allow_update=True, so the
    dictionary grows as the corpus is streamed.  Per post, also records the
    corpus-position -> post-id mapping in self.corpusToPost and advances
    self.ctr.  Emits a progress line to stderr every 5000 posts when
    Config.debug is set.
    """
    for question in util.iterateQuestions(self.db, self.topic, self.postList):
        # Collect only the answer bodies; the full answer objects were
        # previously materialized just to read .body from each.
        answerBodies = [answer.body
                       for answer in util.iterateAnswers(self.db, [question.id])]
        tokens = tokenizePost(question.title, question.body,
                              answerBodies, question.tags)
        if Config.debug and self.ctr > 0 and (self.ctr % 5000) == 0:
            now = time.time()
            print >> sys.stderr, "Posts imported:", self.ctr, "(in %0.1fs, %0.2fpost/s)" % (
                (now - self.t0), self.ctr / (now - self.tbegin))
            self.t0 = now
        self.corpusToPost[self.ctr] = question.id
        self.ctr += 1
        # list(...) instead of a copying [x for x in ...] comprehension.
        yield self.dictionary.doc2bow(list(self.unicodifyTokens(tokens)),
                                      allow_update=True)
def __iter__(self):
    """Stream the corpus: yield a doc2bow vector for every question.

    The vector covers the question's title, body, all answer bodies, and its
    tags.  Side effects per yielded item: self.corpusToPost[position] is set
    to the question id and self.ctr is incremented; with Config.debug a
    throughput line is printed to stderr every 5000 posts.
    """
    for question in util.iterateQuestions(self.db, self.topic, self.postList):
        # Build the body list directly rather than keeping whole answer
        # objects around only to extract .body afterwards.
        bodies = [answer.body
                  for answer in util.iterateAnswers(self.db, [question.id])]
        tokens = tokenizePost(question.title, question.body, bodies,
                              question.tags)
        if Config.debug and self.ctr > 0 and (self.ctr % 5000) == 0:
            now = time.time()
            print >> sys.stderr, "Posts imported:", self.ctr, "(in %0.1fs, %0.2fpost/s)" % (
                (now - self.t0), self.ctr / (now - self.tbegin))
            self.t0 = now
        self.corpusToPost[self.ctr] = question.id
        self.ctr += 1
        # list(...) replaces the redundant [x for x in generator] copy.
        yield self.dictionary.doc2bow(list(self.unicodifyTokens(tokens)),
                                      allow_update=True)
def scoreUsers(db, query, queryResults, topicModel, cutoffPercentile=75, resultCutoff=0.5):
    """Return the value-weighted scores of users who answered a set of posts.

    queryResults must be an iterable of objects exposing .post (with .id and
    .title) and .similarity (relevance of that question to the query).
    Returns UserScore objects sorted by descending score, keeping only users
    whose percentile rank is >= cutoffPercentile.

    NOTE(review): resultCutoff is not used anywhere in this body.
    NOTE(review): sentimentFactor and acceptedBonus are free names here --
    presumably module-level tuning constants; confirm they are defined.
    """
    class PostDetails:
        # Plain record tying an answer to its question and their relevances.
        def __init__(self, questionId=0, answerId=0, title="", questionRelevance=0, answerRelevance=0):
            self.questionId = questionId
            self.answerId = answerId
            self.title = title
            self.questionRelevance = questionRelevance
            self.answerRelevance = answerRelevance

    class UserScore:
        # Aggregated score for one user across all of their matched answers.
        def __init__(self, userId, user, score, meanRelevance, postIds):
            self.userId = userId
            self.user = user
            self.score = score
            self.meanRelevance = meanRelevance
            self.postIds = postIds
            self.nPosts = len(self.postIds)
            # NOTE(review): debug print in a constructor -- fires for every
            # user scored; consider routing through logging instead.
            print repr(self)

        def __repr__(self):
            return repr((self.user, self.userId, self.score, self.meanRelevance))

        def starScore(self, cutoffPercentile=75, nStars=5):
            """Convert percentileRank into 1..nStars stars.

            self.percentileRank must have been assigned before calling
            (it is set by the loop near the end of scoreUsers).
            """
            self.stars = int(min([nStars, (1+(self.percentileRank - cutoffPercentile - 1.0)//((100.0-cutoffPercentile)/nStars))]))
            return self

    # Parallel accumulators, one entry per scored answer.
    ids = []
    relevance = []
    userIds = []
    postIds = []
    commentSentiment = []
    # NOTE(review): querySim is constructed but never used below -- the
    # per-answer relevance is hard-coded to 1.0 (see answerRelevance).
    querySim = topic_classification.TopicModeling.QuerySimilarity(topicModel, query)
    # Map question id -> its query result, for joining answers back to
    # their question's relevance.  Passing the dict to iterateAnswers
    # supplies its keys (the question ids).
    id2qr = { queryResult.post.id : queryResult for queryResult in queryResults }
    for answer in util.iterateAnswers(db, id2qr):
        # Credit the original author; fall back to the last editor.
        useUserId = answer.owner_user_id if answer.owner_user_id is not None else answer.last_editor_user_id
        if useUserId:
            questionQr = id2qr[answer.parent_id]
            ids.append(answer.id)
            userIds.append(useUserId)
            answerRelevance = 1.0 #querySim.similarity(answer.body)
            relevance.append(questionQr.similarity*answerRelevance)
            postIds.append( PostDetails( questionId=answer.parent_id, answerId=answer.id, title=questionQr.post.title, questionRelevance=questionQr.similarity, answerRelevance=answerRelevance ) )
    logging.debug("iterating answers complete, getting prescores")
    ages, scores, favorites, views, accepted = getAnswerPrescores(db, ids)
    logging.debug("got prescores...getting sentiment")
    commentSentimentDict = scoreCommentSentiment(db, ids)
    #commentSentiment = getCommentSentiments(db, ids)
    commentSentiment = array([commentSentimentDict.get(ident, 0) for ident in ids])
    logging.debug("got sentiment...calculating scores...")
    # Fold comment sentiment into the raw answer scores.
    scores += commentSentiment * sentimentFactor
    # calculate the scores of the posts for this query using a scoring heuristic
    postIds = array(postIds)
    relevance = array(relevance)
    accepted = array(accepted)
    # Normalize each signal to its within-result-set percentile (0..1).
    pctScores = array([percentileofscore(scores, s, 'strict') for s in scores], double)/100.0
    pctFavorites = array([percentileofscore(favorites, f, 'strict') for f in favorites], double)/100.0
    pctViews = array([percentileofscore(views, v, 'strict') for v in views], double)/100.0
    # Heuristic: relevance boosted multiplicatively by each percentile
    # signal, plus a bonus for accepted answers.
    postScore = relevance * (1.0+pctScores) * (1.0+pctFavorites) * (1.0 + pctViews) * (1.0 + acceptedBonus * accepted)
    userIdSet = frozenset(userIds)
    userIds = array(userIds)
    # NOTE(review): assumes util.usersById yields display names in the same
    # order as the ids iterated from userIdSet -- TODO confirm.
    displayNames = {userId : displayName for (userId, displayName) in zip(userIdSet, util.usersById(db, userIdSet))}
    # Aggregate per user with boolean masks over the parallel arrays.
    userScores = [ UserScore( user, displayNames[user], postScore[userIds==user].sum(), relevance[userIds==user].mean(), postIds[userIds==user] ) for user in userIdSet]
    allUserScores = array([userScore.score for userScore in userScores], dtype=double)
    for userScore in userScores:
        userScore.percentileRank = percentileofscore(allUserScores, userScore.score)
    logging.debug("sorting users by score...")
    # Keep only users at or above the cutoff percentile, best score first.
    return sorted(filter(lambda us: us.percentileRank >= cutoffPercentile, userScores), key=lambda u: -u.score)