class TopicChange:
    """Map inferred per-document topic rows back to the users who wrote them.

    The lookup tables used for that mapping (post -> month, document -> post,
    post -> user) are fetched from a ``DataHandler`` once, at construction.
    """

    def __init__(self, dataFile, userJoins, activeForums):
        """Build the ``DataHandler`` and cache its lookup dictionaries.

        Args:
            dataFile: forwarded to the ``DataHandler`` constructor.
            userJoins: forwarded to the ``DataHandler`` constructor.
            activeForums: forwarded to ``DataHandler.loadActiveForums``.
        """
        sys.stderr.write("Started\n")
        self.dataHandler = DataHandler(dataFile, userJoins)
        self.dataHandler.loadActiveForums(activeForums)
        sys.stderr.write("Data loaded\n")
        self.post2Month = self.dataHandler.getPost2Month()
        self.doc2Post = self.dataHandler.getDoc2Post()
        self.post2User = self.dataHandler.getPost2User()
        sys.stderr.write("Got the dicts\n")

    def loadInferredTopics(self, topicsOutput):
        """Print how many distinct users authored the documents in a topics CSV.

        The first column of every row is taken as the document id; it is
        resolved to a post through ``self.doc2Post`` and then to a user
        through ``self.post2User``.

        Args:
            topicsOutput: path of the CSV file to read.

        Raises:
            KeyError: if a document id or its post is missing from the
                lookup tables.
        """
        # NOTE: earlier revisions sketched a per-user/per-month topic-weight
        # accumulation here, but it was disabled; only the distinct-user count
        # is produced.
        numUsers = set()
        # `with` guarantees the handle is released even when a lookup fails.
        with open(topicsOutput) as topicsFile:
            for doc in csv.reader(topicsFile):
                if not doc:
                    # csv.reader yields [] for blank lines; there is no id to read.
                    continue
                docId = doc[0]
                numUsers.add(self.post2User[self.doc2Post[docId]])
        print(len(numUsers))
from DataHandler import DataHandler


def main():
    """Load the thread data, determine the active forums and dump them to a TSV file."""
    handler = DataHandler(
        "/usr0/home/pgadde/Work/Ethnic/Hoodup/DataExploration/SampledPosts2/TopicChange/Data/allThreads.csv",
        "/usr0/home/pgadde/Work/Ethnic/Hoodup/DataExploration/SampledPosts2/TopicChange/Data/userJoins",
    )
    handler.loadActiveForums()
    # The result is written next to the input data.
    handler.printActiveForums(
        "/usr0/home/pgadde/Work/Ethnic/Hoodup/DataExploration/SampledPosts2/TopicChange/Data/activeForums.tsv"
    )


if __name__ == "__main__":
    main()
import sys
from DataHandler import DataHandler


def analyzeUser(userNum, DH, baseDir):
    """Dump one user's monthly data to a file named after the user.

    Args:
        userNum: user identifier; converted to ``str`` before use.
        DH: object providing ``printMonthlyDataForUser(userNum, outFile)``.
        baseDir: path prefix; the output file is ``baseDir + str(userNum)``,
            so it must already end with a path separator.
    """
    userNum = str(userNum)
    # The expression between "User:" and "\n" was corrupted in the source;
    # reconstructed as the user number, which is what the message labels.
    sys.stderr.write("User:" + userNum + "\n")
    outFile = baseDir + userNum
    DH.printMonthlyDataForUser(userNum, outFile)


if __name__ == '__main__':
    baseDir = "/usr0/home/pgadde/Work/Ethnic/Hoodup/DataExploration/SampledPosts2/DebugTime/"
    dataFile = baseDir + "allThreads.csv"
    userJoins = baseDir + "userJoins"
    activeForums = baseDir + "activeForums.csv"
    DH = DataHandler(dataFile, userJoins)
    DH.loadActiveForums(activeForums)
    # Other users inspected previously: 9258, 30702.
    analyzeUser(35541, DH, baseDir)
class preProcessor:
    """Write tokenized data obtained from a ``DataHandler`` out as indexed CSV rows.

    One of the ``prepareTokenized*`` methods fills ``self.tokenizedData``; the
    matching ``print*`` method then serialises it. Every output row starts
    with a running 1-based index.
    """

    def __init__(self, dataFile, userJoins):
        """Create the ``DataHandler`` and load its active forums.

        Args:
            dataFile: forwarded to the ``DataHandler`` constructor.
            userJoins: forwarded to the ``DataHandler`` constructor.
        """
        self.dataHandler = DataHandler(dataFile, userJoins)
        self.dataHandler.loadActiveForums()
        self.tokenizedData = []

    def prepareTokenizedCSV(self):
        """Cache ``DataHandler.getTokenizedCSV()`` (consumed by ``printDataForTMT``)."""
        self.tokenizedData = self.dataHandler.getTokenizedCSV()

    def prepareTokenizedUserMonthCSV(self):
        """Cache ``DataHandler.getTokenizedUserMonthCSV()`` (user -> month -> texts)."""
        self.tokenizedData = self.dataHandler.getTokenizedUserMonthCSV()

    def prepareTokenizedUserMonthForumCSV(self):
        """Cache ``DataHandler.getTokenizedUserMonthForumCSV()`` (user -> month -> forum -> texts)."""
        self.tokenizedData = self.dataHandler.getTokenizedUserMonthForumCSV()

    def printDataForTMT(self, outFile):
        """Write every cached record to ``outFile`` as CSV, prefixed by its index.

        The cached records are left untouched, so the method can be called
        repeatedly without the index column piling up.

        Args:
            outFile: path of the CSV file to (over)write.
        """
        with open(outFile, "w") as handle:
            writer = csv.writer(handle)
            for index, record in enumerate(self.tokenizedData, 1):
                writer.writerow([index] + list(record))

    def getRequiredDataFromRecord(self, record):
        """Unfinished placeholder: currently returns ``None`` for every record."""
        indices = [0]

    def initializeUserMonthRecord(self, user, month):
        """Return ``DataHandler.getBasicUserMonthRecord(user, month)``.

        The callers in this class treat the last element of the result as a
        list that collects post texts.
        """
        return self.dataHandler.getBasicUserMonthRecord(user, month)

    def isProperUnicode(self, text):
        """Return True when ``unicode(text)`` succeeds, False when it cannot be converted."""
        try:
            unicode(text)
        except (UnicodeError, TypeError, ValueError):
            return False
        return True

    def _writeEncodedRow(self, writer, record):
        """Write ``record`` as UTF-8 encoded cells; return False if encoding fails.

        Rows that cannot be encoded are skipped on purpose (best effort), but
        only Unicode failures are tolerated; I/O errors still propagate.
        """
        # NOTE(review): the target is a codecs stream that encodes again, so
        # under Python 2 non-ASCII byte strings presumably fail here and the
        # row is dropped -- confirm whether that is intended.
        try:
            writer.writerow([unicode(s).encode("utf-8") for s in record])
        except UnicodeError:
            return False
        return True

    def printInferDataForTMT(self, outFile):
        """Write one CSV row per (user, month) with all of that month's text.

        Row layout: index, the basic user/month record with its last element
        replaced by the space-joined texts, then the number of texts.
        The index only advances for rows that were actually written.

        Args:
            outFile: path of the CSV file to (over)write.
        """
        index = 1
        with codecs.open(outFile, encoding="utf-8", mode="w+") as handle:
            writer = csv.writer(handle)
            for user in self.tokenizedData:
                for month in self.tokenizedData[user]:
                    record = self.initializeUserMonthRecord(user, month)
                    texts = [text for text in self.tokenizedData[user][month]
                             if self.isProperUnicode(text)]
                    record[-1].extend(texts)
                    record[-1] = " ".join(record[-1])
                    record.insert(0, index)
                    record.append(len(texts))
                    if self._writeEncodedRow(writer, record):
                        index += 1

    def printInferDataForTMTWithForum(self, outFile):
        """Write one CSV row per (user, month, forum) plus an "AllForums" summary row.

        Forum rows carry: index, basic user/month record with joined texts,
        number of texts, forum name. After the forums of a month, a summary
        row aggregates the texts and counts of the forum rows that were
        successfully written, labelled ``"AllForums"``.

        Args:
            outFile: path of the CSV file to (over)write.
        """
        index = 1
        with codecs.open(outFile, encoding="utf-8", mode="w+") as handle:
            writer = csv.writer(handle)
            for user in self.tokenizedData:
                for month in self.tokenizedData[user]:
                    totalPosts = 0
                    allForumsRecord = self.initializeUserMonthRecord(user, month)
                    forums = self.tokenizedData[user][month]
                    for forum in forums:
                        forumRecord = self.initializeUserMonthRecord(user, month)
                        texts = [text for text in forums[forum]
                                 if self.isProperUnicode(text)]
                        forumRecord[-1].extend(texts)
                        # Keep the individual texts before they are joined, so
                        # they can be folded into the summary row.
                        forumPosts = list(forumRecord[-1])
                        forumRecord[-1] = " ".join(forumRecord[-1])
                        forumRecord.insert(0, index)
                        forumRecord.append(len(texts))
                        forumRecord.append(forum)
                        if self._writeEncodedRow(writer, forumRecord):
                            index += 1
                            totalPosts += len(texts)
                            allForumsRecord[-1].extend(forumPosts)
                    allForumsRecord[-1] = " ".join(allForumsRecord[-1])
                    allForumsRecord.insert(0, index)
                    index += 1
                    allForumsRecord.append(totalPosts)
                    allForumsRecord.append("AllForums")
                    # The summary row is not best-effort: a failure here propagates.
                    writer.writerow([unicode(s).encode("utf-8") for s in allForumsRecord])
class UserwiseDivergenceAnalysis:
    """Compute, per user and month, two Jensen-Shannon divergences of the user's text.

    For every month the user's distribution is compared against an "initial"
    and a "matured" reference distribution. The ``prepareUserDivergences*``
    variants differ only in where the two references come from.
    """

    def __init__(self, dataFile, userJoins):
        """Set up the comparer and the data handler, then preprocess the vocabulary.

        Args:
            dataFile: forwarded to the ``DataHandler`` constructor.
            userJoins: forwarded to the ``DataHandler`` constructor.
        """
        sys.stderr.write('In Constructor\n')
        self.distComparer = DistComparer()
        self.dataHandler = DataHandler(dataFile, userJoins)
        self.dataHandler.loadActiveForums()
        self.__loadData()
        self.sampledUsers = set()

    def __loadData(self):
        """Read the stop-word list and hand it to ``DataHandler.preprocessVocab``."""
        with open("/usr0/home/pgadde/Work/Ethnic/Hoodup/DataExploration/SampledPosts/Regression/stopWords") as stopWordsFile:
            stopWords = set(s.strip() for s in stopWordsFile)
        self.dataHandler.preprocessVocab(stopWords)

    def sampleUsers(self):
        """Store the users returned by ``DataHandler.sampleUsers()``."""
        self.sampledUsers = self.dataHandler.sampleUsers()

    def doDivergenceAnalysisPerUser(self, outFile, divergenceFn=None):
        """Write ``user<TAB>month<TAB>div0<TAB>div1`` lines for every sampled user.

        Args:
            outFile: path of the tab-separated file to (over)write.
            divergenceFn: callable taking a user and returning
                ``{month: (div0, div1)}`` or ``-1`` to skip the user.
                Defaults to ``self.prepareUserDivergences``.
        """
        # NOTE(review): the statements between the user loop and the write
        # were lost in the source; which prepareUserDivergences* variant was
        # called (and the month ordering) is an assumption -- confirm.
        if divergenceFn is None:
            divergenceFn = self.prepareUserDivergences
        with open(outFile, 'w') as out:
            for user in self.sampledUsers:
                userDivergences = divergenceFn(user)
                if userDivergences == -1:
                    # prepareUserDivergencesActive signals "not applicable" with -1.
                    continue
                for month in userDivergences:
                    out.write(str(user)+'\t'+str(month)+'\t'+str(userDivergences[month][0])+'\t'+str(userDivergences[month][1])+'\n')

    def _monthlyDivergences(self, userNum, userMonths, initialDist, maturedDist):
        """Return ``{month: (JS(initial, month), JS(month, matured))}`` for the user."""
        divergences = {}
        for userMonth in userMonths:
            monthData = self.dataHandler.makeDist(
                self.dataHandler.getUserDataForDivergence(userNum, userMonth))
            divergences[userMonth] = (
                self.distComparer.jsDivergence(initialDist, monthData),
                self.distComparer.jsDivergence(monthData, maturedDist),
            )
        return divergences

    def prepareUserDivergencesActive(self, userNum):
        """Compare the user's months against the user's active forum.

        Returns:
            ``{month: (div0, div1)}``, or ``-1`` when the active forum's name
            does not contain ``"Talk"``.
        """
        userMonths = self.dataHandler.getUserMonths(userNum)
        activeForum = self.dataHandler.getActiveForum(userNum)
        if "Talk" not in activeForum:
            return -1
        userInitialData = self.dataHandler.makeDist(self.dataHandler.getForumInitialData(activeForum))
        userMaturedData = self.dataHandler.makeDist(self.dataHandler.getForumMaturedData(activeForum))
        return self._monthlyDivergences(userNum, userMonths, userInitialData, userMaturedData)

    def prepareUserDivergencesBackground(self, userNum):
        """Compare the user's months against the ``"AllTalk"`` forum data.

        Returns:
            ``{month: (div0, div1)}``.
        """
        # Was self.__dataHandler, which is name-mangled and never assigned.
        userMonths = self.dataHandler.getUserMonths(userNum)
        userInitialData = self.dataHandler.makeDist(self.dataHandler.getForumInitialData("AllTalk"))
        userMaturedData = self.dataHandler.makeDist(self.dataHandler.getForumMaturedData("AllTalk"))
        return self._monthlyDivergences(userNum, userMonths, userInitialData, userMaturedData)

    def prepareUserDivergences(self, userNum):
        """Compare the user's months against the user's own initial/matured data.

        Returns:
            ``{month: (div0, div1)}``.
        """
        userMonths = self.dataHandler.getUserMonths(userNum)
        userInitialData = self.dataHandler.makeDist(self.dataHandler.getUserInitialData(userNum))
        userMaturedData = self.dataHandler.makeDist(self.dataHandler.getUserMaturedData(userNum))
        return self._monthlyDivergences(userNum, userMonths, userInitialData, userMaturedData)