def sampleRow(self, amount):
    """Sample one row of category counts from a single mixture component.

    A component index is drawn with ``ST.drawCategory(self.mixture)`` and
    the matching entry of ``self.multinomials`` is then sampled ``amount``
    times.

    Returns a list of length ``self.K`` whose k-th entry is the number of
    draws that landed on category k.
    """
    component = ST.drawCategory(self.mixture)
    weights = self.multinomials[component]
    counts = [0] * self.K
    for _ in range(amount):
        counts[ST.drawCategory(weights)] += 1
    return counts
def sampleRow(self, amount):
    """Return per-category counts for ``amount`` draws from one component.

    The component is picked once, via ``ST.drawCategory(self.mixture)``;
    every draw for this row then uses that component's entry in
    ``self.multinomials``. The result is a list of ``self.K`` integers.
    """
    chosen = ST.drawCategory(self.mixture)
    distribution = self.multinomials[chosen]
    tally = [0] * self.K
    remaining = amount
    while remaining > 0:
        bucket = ST.drawCategory(distribution)
        tally[bucket] += 1
        remaining -= 1
    return tally
def sampleRow(self, amount):
    """Sample one row of category counts from a Dirichlet mixture component.

    Steps, in order:
      1. pick a component index with ``ST.drawCategory(self.mixture)``;
      2. draw a multinomial from that component's entry in
         ``self.dirichlets`` using ``ST.drawFromDirichlet``;
      3. draw ``amount`` categories from that multinomial.

    Returns a list of length ``self.K`` holding the count per category.
    """
    component = ST.drawCategory(self.mixture)
    weights = ST.drawFromDirichlet(self.dirichlets[component])
    counts = [0] * self.K
    for _ in range(amount):
        counts[ST.drawCategory(weights)] += 1
    return counts
def sampleRow(self, amount):
    """Return per-category counts for ``amount`` draws from one component.

    One component is selected through ``ST.drawCategory(self.mixture)``.
    A single multinomial is drawn from that component's Dirichlet
    (``self.dirichlets[...]`` via ``ST.drawFromDirichlet``) and reused for
    every draw in the row. The result is a list of ``self.K`` integers.
    """
    chosen = ST.drawCategory(self.mixture)
    alpha = self.dirichlets[chosen]
    distribution = ST.drawFromDirichlet(alpha)
    tally = [0] * self.K
    remaining = amount
    while remaining > 0:
        bucket = ST.drawCategory(distribution)
        tally[bucket] += 1
        remaining -= 1
    return tally
def sampleRow(self, amount):
    """Sample one row of category counts from this node of the mixture tree.

    A component index is drawn with
    ``ST.drawCategory(self.multinomialMixture.mixture)``. If
    ``self.mixtureNodes`` holds a truthy child at that index, sampling is
    delegated to the child's ``sampleRow``. Otherwise ``amount`` categories
    are drawn from the matching entry of
    ``self.multinomialMixture.multinomials``.

    Returns a list of length ``self.K`` holding the count per category
    (or whatever the child node's ``sampleRow`` returns when delegating).
    """
    c = ST.drawCategory(self.multinomialMixture.mixture)

    child = self.mixtureNodes[c]
    if child:
        return child.sampleRow(amount)

    # Index with the component drawn above; the previous code referenced an
    # undefined name (`category`) here and raised NameError on the leaf path.
    multinomial = self.multinomialMixture.multinomials[c]
    retVal = [0] * self.K
    for _ in range(amount):
        retVal[ST.drawCategory(multinomial)] += 1
    return retVal
dataObj = DME.CompressedRowData(K) idx = 0 for row in reader: idx += 1 if (random.random() < float(options.sampleRate)): data = map(int, row) if (len(data) != K): logging.error("There are %s categories, but line has %s counts." % (K, len(data))) logging.error("line %s: %s" % (i, data)) while sum(data) > options.M: data[Sample.drawCategory(data)] -= 1 sumData = sum(data) weightForMean = 1.0 / (1.0 + sumData) for i in range(0, K): priors[i] += data[i] * weightForMean dataObj.appendRow(data, 1) if (idx % 1000000) == 0: logging.debug("Loading Data: %s rows done" % idx) dataLoadTime = time.time() logging.debug("loaded %s records into memory" % idx) logging.debug("time to load memory: %s " % (dataLoadTime - startTime)) for row in dataObj.U: if len(row) == 0 and not hasHyperprior:
priors = [0.]*K dataObj = DME.CompressedRowData(K) idx = 0 for row in reader: idx += 1 if (random.random() < float(options.sampleRate)): data = map(int, row) if (len(data) != K): logging.error("There are %s categories, but line has %s counts." % (K, len(data))) logging.error("line %s: %s" % (idx, data)) while sum(data) > options.M: data[Sample.drawCategory(data)] -= 1 sumData = sum(data) weightForMean = 1.0 / (1.0 + sumData) for i in range(0, K): priors[i] += data[i] * weightForMean dataObj.appendRow(data, 1) if (idx % 1000000) == 0: logging.debug("Loading Data: %s rows done" % idx) dataLoadTime = time.time() logging.debug("loaded %s records into memory" % idx) logging.debug("time to load memory: %s " % (dataLoadTime - startTime)) for row in dataObj.U: if len(row) == 0 and not hasHyperprior: # TODO(max): write up a paper describing the hyperprior and link it.
def main(K, iterations, H, input_stream, sampleRate, M):
    """Load count rows from a tab-separated stream and fit Dirichlet priors.

    Args:
        K: Number of categories; every input row is expected to hold K
            integer counts.
        iterations: Passed through to ``DME.findDirichletPriors``.
        H: Comma-separated hyperprior string. When it has exactly K + 1
            fields, the first K are parsed as ``Beta`` and the last as
            ``W``; otherwise both are ``None`` and no hyperprior is used.
        input_stream: Iterable of lines handed to ``csv.reader`` with a tab
            delimiter.
        sampleRate: A row is kept when ``random.random() < float(sampleRate)``.
        M: Maximum row total; larger rows are reduced one count at a time,
            at indices chosen by ``Sample.drawCategory``, until they total M.

    Returns:
        The value returned by ``DME.findDirichletPriors``.

    Raises:
        Exception: If any entry of ``dataObj.U`` is empty and no hyperprior
            was supplied.
    """
    startTime = time.time()
    logging.debug("K = %s", K)
    logging.debug("iterations = %s", iterations)
    logging.debug("H = %s", H)
    logging.debug("sampleRate = %s", sampleRate)
    logging.debug("M = %s", M)

    # TODO(max): write up a paper describing the hyperprior and link it.
    Hstr = H.split(",")
    hasHyperprior = len(Hstr) == K + 1
    if hasHyperprior:
        Beta = [float(value) for value in Hstr[:K]]
        W = float(Hstr[K])
    else:
        Beta = None
        W = None

    logging.debug("Beta = %s", Beta)
    logging.debug("W = %s", W)

    #####
    # Load Data
    #####
    csv.field_size_limit(1000000000)
    reader = csv.reader(input_stream, delimiter='\t')
    logging.debug("Loading data")
    priors = [0.] * K
    dataObj = DME.CompressedRowData(K)
    keepProbability = float(sampleRate)  # loop-invariant, convert once

    idx = 0
    for row in reader:
        idx += 1

        if random.random() < keepProbability:
            # Materialize a real list: the row is measured with len() and
            # mutated in place below, which a lazy map object does not
            # support.
            data = list(map(int, row))
            if len(data) != K:
                # The row is logged but still processed, as before.
                logging.error("There are %s categories, but line has %s counts.", K, len(data))
                # Report the row counter; `i` is the category loop index and
                # may be stale or unbound here.
                logging.error("line %s: %s", idx, data)

            # Trim the row down to at most M total counts. Each step removes
            # exactly one count, so the total is tracked instead of re-summed.
            sumData = sum(data)
            while sumData > M:
                data[Sample.drawCategory(data)] -= 1
                sumData -= 1

            # Weight each row by 1 / (1 + row total) when accumulating.
            weightForMean = 1.0 / (1.0 + sumData)
            for i in range(K):
                priors[i] += data[i] * weightForMean
            dataObj.appendRow(data, 1)

        if (idx % 1000000) == 0:
            logging.debug("Loading Data: %s rows done", idx)

    dataLoadTime = time.time()
    logging.debug("loaded %s records into memory", idx)
    logging.debug("time to load memory: %s ", dataLoadTime - startTime)

    if not hasHyperprior:
        for row in dataObj.U:
            if len(row) == 0:
                # TODO(max): write up a paper describing the hyperprior and link it.
                raise Exception("You can't have any columns with all 0s, unless you provide a hyperprior (-H)")

    priorSum = sum(priors) + 0.01  # Nudge to prevent zero
    for i in range(K):
        priors[i] /= priorSum
        priors[i] += 0.01  # Nudge to prevent zero

    priors = DME.findDirichletPriors(dataObj, priors, iterations, Beta, W)
    logging.debug("Final average loss: %s", DME.getTotalLoss(priors, dataObj, Beta, W))

    totalTime = time.time() - dataLoadTime
    logging.debug("Time to calculate: %s", totalTime)
    return priors
# Load sampled rows of integer counts from tab-separated stdin into a plain
# list of rows.
csv.field_size_limit(1000000000)
reader = csv.reader(sys.stdin, delimiter='\t')
logging.debug("Loading data")
dataObj = []

idx = 0
for row in reader:
    idx += 1

    # Keep a row only when a uniform draw falls below options.sampleRate.
    if random.random() < float(options.sampleRate):
        # Materialize a real list: the row is measured with len() and
        # mutated in place below, which a lazy map object does not support.
        data = list(map(int, row))
        if len(data) != K:
            logging.error("There are %s categories, but line has %s counts." % (K, len(data)))
            # Report the row counter; `i` is not the row number here.
            logging.error("line %s: %s" % (idx, data))

        # Trim rows whose total exceeds options.M, one count at a time, at
        # an index chosen by Sample.drawCategory.
        while sum(data) > options.M:
            data[Sample.drawCategory(data)] -= 1

        dataObj.append(data)

    if (idx % 1000000) == 0:
        logging.debug("Loading Data: %s rows done" % idx)

dataLoadTime = time.time()
logging.debug("loaded %s records into memory" % idx)
logging.debug("time to load memory: %s " % (dataLoadTime - startTime))

# TODO(max): enforce this
#for row in dataObj:
#  if len(row) == 0 and not hasHyperprior:
#    # TODO(max): write up a paper describing the hyperprior and link it.
#    raise Exception("You can't have any columns with all 0s, unless you provide a hyperprior (-H)")

# Mixture hyperparams (the mixture itself has a dirichlet prior)