class TopicChange:
    """Relates rows of a topic-inference CSV back to the users who wrote them.

    The constructor loads the forum data through ``DataHandler`` and caches
    the lookup dicts it exposes. Only ``doc2Post`` (document id -> post) and
    ``post2User`` (post -> user) are consulted by the code in this class;
    ``post2Month`` is cached but currently unused.
    """

    def __init__(self, dataFile, userJoins, activeForums):
        """Load the data set and fetch the lookup dicts.

        Args:
            dataFile: path handed to ``DataHandler`` as its data file.
            userJoins: path handed to ``DataHandler`` as its user-joins file.
            activeForums: argument forwarded to
                ``DataHandler.loadActiveForums``.
        """
        sys.stderr.write("Started\n")
        self.dataHandler = DataHandler(dataFile, userJoins)
        self.dataHandler.loadActiveForums(activeForums)
        sys.stderr.write("Data loaded\n")
        self.post2Month = self.dataHandler.getPost2Month()
        self.doc2Post = self.dataHandler.getDoc2Post()
        self.post2User = self.dataHandler.getPost2User()
        sys.stderr.write("Got the dicts\n")

    def loadInferredTopics(self, topicsOutput):
        """Print the number of distinct users found in a topics CSV.

        The first column of every row is taken as a document id and resolved
        to a user through ``doc2Post`` and then ``post2User``.

        NOTE: an earlier revision sketched a per-user / per-month
        accumulation of one topic column (index 5) via ``post2Month``; it
        was disabled and is not implemented here.

        Args:
            topicsOutput: path of the CSV file to read.

        Raises:
            KeyError: if a document id or its post is missing from the
                lookup dicts.
        """
        users = set()
        # The context manager guarantees the file is closed even when a
        # lookup below raises.
        with open(topicsOutput) as topicsFile:
            for doc in csv.reader(topicsFile):
                if not doc:
                    # csv.reader yields [] for blank lines; there is no
                    # document id to look up.
                    continue
                users.add(self.post2User[self.doc2Post[doc[0]]])
        # Parenthesised single argument: prints the same text under the
        # Python 2 print statement and the Python 3 print function.
        print(len(users))
from DataHandler import DataHandler

if __name__ == "__main__":
    # Script entry point: build a DataHandler from the fixed input files,
    # load the active forums and write them to the output file.
    # All paths below are machine-specific.
    dataFile = "/usr0/home/pgadde/Work/Ethnic/Hoodup/DataExploration/SampledPosts2/TopicChange/Data/allThreads.csv"
    userJoins = "/usr0/home/pgadde/Work/Ethnic/Hoodup/DataExploration/SampledPosts2/TopicChange/Data/userJoins"
    outFile = "/usr0/home/pgadde/Work/Ethnic/Hoodup/DataExploration/SampledPosts2/TopicChange/Data/activeForums.tsv"

    DH = DataHandler(dataFile, userJoins)
    DH.loadActiveForums()
    DH.printActiveForums(outFile)
# Example #3
# 0
import sys
from DataHandler import DataHandler

def analyzeUser(userNum, DH, baseDir):
  """Write one user's monthly data to a file named after that user.

  Args:
    userNum: user identifier; converted with str() before use.
    DH: object providing printMonthlyDataForUser(userNum, outFile).
    baseDir: output location prefix. It is joined to the user id by plain
      string concatenation, so it must already end with a path separator.
  """
  userNum = str(userNum)
  # NOTE(review): the middle of this statement was masked ("******") in the
  # copy of the file under review; writing the user id is the reconstruction.
  sys.stderr.write("User:" + userNum + "\n")
  DH.printMonthlyDataForUser(userNum, baseDir + userNum)

if __name__ == '__main__':
  # Debug driver: load the data set found under baseDir and dump the
  # monthly data of a single user next to it.
  baseDir = "/usr0/home/pgadde/Work/Ethnic/Hoodup/DataExploration/SampledPosts2/DebugTime/"
  dataFile, userJoins, activeForums = (
    baseDir + "allThreads.csv",
    baseDir + "userJoins",
    baseDir + "activeForums.csv",
  )

  DH = DataHandler(dataFile, userJoins)
  DH.loadActiveForums(activeForums)

  # Other users this driver has been pointed at (kept disabled):
  #analyzeUser(9258, DH, baseDir)
  #analyzeUser(30702, DH, baseDir)
  analyzeUser(35541, DH, baseDir)
class preProcessor:
    """Fetches tokenized data from a DataHandler and writes it as CSV for TMT.

    ``tokenizedData`` holds whatever the last ``prepare*`` call returned:
    a sequence of list records for ``printDataForTMT``, a
    user -> month -> texts mapping for ``printInferDataForTMT``, or a
    user -> month -> forum -> texts mapping for
    ``printInferDataForTMTWithForum``.
    """

    def __init__(self, dataFile, userJoins):
        """Create the DataHandler and load its active forums."""
        self.dataHandler = DataHandler(dataFile, userJoins)
        self.dataHandler.loadActiveForums()
        self.tokenizedData = []

    def prepareTokenizedCSV(self):
        """Store the handler's per-record tokenized data."""
        self.tokenizedData = self.dataHandler.getTokenizedCSV()

    def prepareTokenizedUserMonthCSV(self):
        """Store the handler's tokenized data grouped by user and month."""
        self.tokenizedData = self.dataHandler.getTokenizedUserMonthCSV()

    def prepareTokenizedUserMonthForumCSV(self):
        """Store the handler's tokenized data grouped by user, month, forum."""
        self.tokenizedData = self.dataHandler.getTokenizedUserMonthForumCSV()

    def printDataForTMT(self, outFile):
        """Write every record to ``outFile`` as CSV, prefixed by a 1-based index.

        The stored records are left untouched, so calling this twice produces
        the same file instead of stacking a second index column.
        """
        with open(outFile, "w") as handle:
            writer = csv.writer(handle)
            for index, record in enumerate(self.tokenizedData, 1):
                writer.writerow([index] + list(record))

    def getRequiredDataFromRecord(self, record):
        """Unfinished stub: always returns None.

        NOTE(review): the original body only bound an unused ``indices = [0]``
        local; the intended selection logic is not visible in this file.
        """
        return None

    def initializeUserMonthRecord(self, user, month):
        """Return the handler's basic record for ``user`` and ``month``.

        The callers in this class treat the last element as a list of texts.
        """
        return self.dataHandler.getBasicUserMonthRecord(user, month)

    def isProperUnicode(self, text):
        """Return True when ``unicode(text)`` succeeds, False on a Unicode error."""
        try:
            unicode(text)
        except UnicodeError:
            return False
        return True

    def _properTexts(self, texts):
        """Return the items of ``texts`` that pass ``isProperUnicode``, in order."""
        return [text for text in texts if self.isProperUnicode(text)]

    def _encodeRow(self, row):
        """Return ``row`` with every cell converted to a UTF-8 byte string."""
        return [unicode(cell).encode("utf-8") for cell in row]

    def _tryWriteRow(self, writer, row):
        """Write ``row`` best-effort; return False if it cannot be encoded."""
        try:
            writer.writerow(self._encodeRow(row))
        except UnicodeError:
            # Rows that cannot be encoded are skipped, as before; other
            # failures (e.g. I/O errors) now propagate instead of being hidden.
            return False
        return True

    def printInferDataForTMT(self, outFile):
        """Write one CSV row per (user, month).

        Row layout: index, the handler's basic record with its last element
        replaced by the space-joined texts, then the number of texts. The
        index only advances for rows that were actually written.
        """
        # NOTE(review): cells are already UTF-8 byte strings when they reach
        # this codecs stream, which encodes again. Under Python 2 that raises
        # for non-ASCII bytes, so such rows are skipped. Confirm whether a
        # plain binary file was intended.
        with codecs.open(outFile, encoding="utf-8", mode="w+") as handle:
            writer = csv.writer(handle)
            index = 1
            for user in self.tokenizedData:
                for month in self.tokenizedData[user]:
                    posts = self._properTexts(self.tokenizedData[user][month])
                    record = self.initializeUserMonthRecord(user, month)
                    record[-1].extend(posts)
                    record[-1] = " ".join(record[-1])
                    record.insert(0, index)
                    record.append(len(posts))
                    if self._tryWriteRow(writer, record):
                        index += 1

    def printInferDataForTMTWithForum(self, outFile):
        """Write one CSV row per (user, month, forum) plus an "AllForums" row.

        Forum rows have the layout of ``printInferDataForTMT`` followed by the
        forum name. After the forums of a (user, month) pair, a summary row
        labelled "AllForums" aggregates the texts and counts of the forum
        rows that were successfully written. The summary row is not
        best-effort: an encoding failure there propagates.
        """
        # NOTE(review): same double-encoding concern as printInferDataForTMT.
        with codecs.open(outFile, encoding="utf-8", mode="w+") as handle:
            writer = csv.writer(handle)
            index = 1
            for user in self.tokenizedData:
                for month in self.tokenizedData[user]:
                    totalPosts = 0
                    allForumsRecord = self.initializeUserMonthRecord(user, month)
                    forums = self.tokenizedData[user][month]
                    for forum in forums:
                        posts = self._properTexts(forums[forum])
                        record = self.initializeUserMonthRecord(user, month)
                        record[-1].extend(posts)
                        # Shallow copy is enough: the elements are strings.
                        forumPosts = list(record[-1])
                        record[-1] = " ".join(forumPosts)
                        record.insert(0, index)
                        record.append(len(posts))
                        record.append(forum)
                        if self._tryWriteRow(writer, record):
                            index += 1
                            totalPosts += len(posts)
                            allForumsRecord[-1].extend(forumPosts)
                    allForumsRecord[-1] = " ".join(allForumsRecord[-1])
                    allForumsRecord.insert(0, index)
                    index += 1
                    allForumsRecord.append(totalPosts)
                    allForumsRecord.append("AllForums")
                    writer.writerow(self._encodeRow(allForumsRecord))
class UserwiseDivergenceAnalysis:
  """Computes, per user and month, two divergences against reference data.

  For every month of a user the month's distribution is compared with an
  "initial" and a "matured" reference distribution using
  ``DistComparer.jsDivergence`` (presumably Jensen-Shannon divergence;
  confirm against DistComparer). The three ``prepareUserDivergences*``
  methods differ only in where the reference data comes from.
  """

  # Stop-word list read by __loadData, one word per line (machine-specific).
  STOP_WORDS_FILE = "/usr0/home/pgadde/Work/Ethnic/Hoodup/DataExploration/SampledPosts/Regression/stopWords"

  def __init__(self, dataFile, userJoins):
    """Build the comparer and data handler, then preprocess the vocabulary."""
    sys.stderr.write('In Constructor\n')
    self.distComparer = DistComparer()
    self.dataHandler = DataHandler(dataFile, userJoins)
    self.dataHandler.loadActiveForums()
    self.__loadData()
    self.sampledUsers = set()

  def __loadData(self):
    """Read the stop-word list and hand it to the handler's vocab preprocessing."""
    with open(self.STOP_WORDS_FILE) as stopWordsFile:
      stopWords = set(line.strip() for line in stopWordsFile)
    self.dataHandler.preprocessVocab(stopWords)

  def sampleUsers(self):
    """Replace ``sampledUsers`` with the handler's sample."""
    self.sampledUsers = self.dataHandler.sampleUsers()

  def doDivergenceAnalysisPerUser(self, outFile):
    """Write ``user<TAB>month<TAB>divergence0<TAB>divergence1`` lines to ``outFile``.

    NOTE(review): the loop header of the original was lost to masking in the
    copy under review, leaving ``userDivergences`` and ``month`` undefined.
    It is reconstructed here with ``prepareUserDivergences``; confirm which
    ``prepareUserDivergences*`` variant was intended. Months are written in
    sorted order so the output is deterministic.
    """
    with open(outFile, 'w') as out:
      for user in self.sampledUsers:
        userDivergences = self.prepareUserDivergences(user)
        for month in sorted(userDivergences):
          fields = [user, month, userDivergences[month][0], userDivergences[month][1]]
          out.write('\t'.join(str(field) for field in fields) + '\n')

  def _monthlyDivergences(self, userNum, initialData, maturedData):
    """Return {month: (div(initial, month), div(month, matured))} for a user."""
    makeDist = self.dataHandler.makeDist
    initialDist = makeDist(initialData)
    maturedDist = makeDist(maturedData)
    divergences = {}
    for userMonth in self.dataHandler.getUserMonths(userNum):
      monthDist = makeDist(self.dataHandler.getUserDataForDivergence(userNum, userMonth))
      divergences[userMonth] = (
        self.distComparer.jsDivergence(initialDist, monthDist),
        self.distComparer.jsDivergence(monthDist, maturedDist),
      )
    return divergences

  def prepareUserDivergencesActive(self, userNum):
    """Compare a user's months against the data of the user's active forum.

    Returns:
      The month -> divergence-pair dict, or -1 when the active forum's name
      does not contain "Talk" (sentinel kept for existing callers).
    """
    activeForum = self.dataHandler.getActiveForum(userNum)
    if "Talk" not in activeForum:
      return -1
    return self._monthlyDivergences(
      userNum,
      self.dataHandler.getForumInitialData(activeForum),
      self.dataHandler.getForumMaturedData(activeForum),
    )

  def prepareUserDivergencesBackground(self, userNum):
    """Compare a user's months against the "AllTalk" forum data."""
    # The original read self.__dataHandler here, which Python name-mangles
    # to an attribute that is never assigned; self.dataHandler is the
    # attribute the constructor sets.
    return self._monthlyDivergences(
      userNum,
      self.dataHandler.getForumInitialData("AllTalk"),
      self.dataHandler.getForumMaturedData("AllTalk"),
    )

  def prepareUserDivergences(self, userNum):
    """Compare a user's months against the user's own initial/matured data."""
    return self._monthlyDivergences(
      userNum,
      self.dataHandler.getUserInitialData(userNum),
      self.dataHandler.getUserMaturedData(userNum),
    )