Пример #1
0
def updateMixtureModel(data, params, hyperParams):
    """Run one soft-assignment update of the Dirichlet mixture model.

    Each row of `data` is spread over the C components according to its
    posterior component probabilities under the current `params`; each
    component's Dirichlet prior is then re-fit and the mixture weights are
    re-estimated, smoothed by hyperParams.mixtureDirich.
    """
    C = params.C
    K = params.K

    # Per-component accumulators: compressed row data and soft counts.
    compressed = [DME.CompressedRowData(K) for _ in range(C)]
    softCounts = [0.] * C

    for row in data:
        posterior = getComponentProbabilitiesForCounts(row, params)
        for c in range(C):
            weight = posterior[c]
            compressed[c].appendRow(row, weight)
            softCounts[c] += weight

    # Re-fit each component's Dirichlet prior from its weighted rows.
    dirichlets = [
        DME.findDirichletPriors(compressed[c], [1.] * K, 50,
                                hyperParams.beta, hyperParams.W)
        for c in range(C)
    ]

    # Mixture weights: soft counts plus Dirichlet pseudo-counts, normalized.
    smoothed = [softCounts[c] + hyperParams.mixtureDirich[c] for c in range(C)]
    total = sum(smoothed)
    mixture = [float(v) / total for v in smoothed]

    return DirichletMixtureModel(C, K, dirichlets, mixture)
Пример #2
0
def initMixtureModel(data, hyperParams):
    """Build an initial Dirichlet mixture model by dealing rows round-robin.

    Row n is assigned wholly (weight 1) to component n % C, so every
    component starts with a roughly equal share of the data; each
    component's Dirichlet prior and the mixture weights are then estimated.
    """
    C = hyperParams.C
    K = hyperParams.K

    # Per-component accumulators: compressed row data and assignment counts.
    compressed = [DME.CompressedRowData(K) for _ in range(C)]
    counts = [0.] * C

    for n, row in enumerate(data):
        c = n % C
        compressed[c].appendRow(row, 1)
        counts[c] += 1

    # Fit a Dirichlet prior per component from its assigned rows.
    dirichlets = [
        DME.findDirichletPriors(compressed[c], [1.] * K, 50,
                                hyperParams.beta, hyperParams.W)
        for c in range(C)
    ]

    # Mixture weights: counts plus Dirichlet pseudo-counts, normalized.
    smoothed = [counts[c] + hyperParams.mixtureDirich[c] for c in range(C)]
    total = sum(smoothed)
    mixture = [float(v) / total for v in smoothed]

    return DirichletMixtureModel(C, K, dirichlets, mixture)
def initMixtureModel(data, hyperParams):
    """Create a starting mixture model by cycling rows through components.

    Each row is given entirely (weight 1) to one component, advancing
    round-robin, which seeds every component with a similar data share
    before the priors and mixture weights are estimated.
    """
    numComponents = hyperParams.C
    numCategories = hyperParams.K

    perComponent = []
    tallies = []
    for _ in range(numComponents):
        perComponent.append(DME.CompressedRowData(numCategories))
        tallies.append(0.)

    # Deal the rows out, one component at a time.
    componentIdx = 0
    for row in data:
        perComponent[componentIdx].appendRow(row, 1)
        tallies[componentIdx] += 1
        componentIdx = (componentIdx + 1) % numComponents

    # Estimate a Dirichlet prior for each component's share of the data.
    dirichlets = []
    for idx in range(numComponents):
        dirichlets.append(
            DME.findDirichletPriors(perComponent[idx], [1.] * numCategories,
                                    50, hyperParams.beta, hyperParams.W))

    # Mixture weights: observed counts plus pseudo-counts, normalized.
    smoothed = []
    for idx in range(numComponents):
        smoothed.append(tallies[idx] + hyperParams.mixtureDirich[idx])
    norm = sum(smoothed)
    mixture = [float(v) / norm for v in smoothed]

    return DirichletMixtureModel(numComponents, numCategories, dirichlets,
                                 mixture)
def updateMixtureModel(data, params, hyperParams):
    """Perform one posterior-weighted update of the Dirichlet mixture model.

    Every data row contributes to all C components, weighted by its
    posterior component probabilities under the current `params`; the
    component priors and mixture weights are then re-fit.
    """
    numComponents = params.C
    numCategories = params.K

    perComponent = [DME.CompressedRowData(numCategories)
                    for _ in range(numComponents)]
    softCounts = [0.] * numComponents

    # Spread each row across the components by posterior probability.
    for row in data:
        posterior = getComponentProbabilitiesForCounts(row, params)
        for idx in range(numComponents):
            weight = posterior[idx]
            perComponent[idx].appendRow(row, weight)
            softCounts[idx] += weight

    # Re-fit each component's Dirichlet prior from its weighted rows.
    dirichlets = []
    for idx in range(numComponents):
        dirichlets.append(
            DME.findDirichletPriors(perComponent[idx], [1.] * numCategories,
                                    50, hyperParams.beta, hyperParams.W))

    # Smooth the soft counts with the mixture's pseudo-counts, normalize.
    smoothed = [softCounts[idx] + hyperParams.mixtureDirich[idx]
                for idx in range(numComponents)]
    norm = sum(smoothed)
    mixture = [float(v) / norm for v in smoothed]

    return DirichletMixtureModel(numComponents, numCategories, dirichlets,
                                 mixture)
Пример #5
0
            # Accumulate each category's count, down-weighted by the row's
            # total (weightForMean is computed per row, above this fragment),
            # to seed the initial prior estimate.
            priors[i] += data[i] * weightForMean
        dataObj.appendRow(data, 1)

    # Progress logging every 1M input rows.
    if (idx % 1000000) == 0: logging.debug("Loading Data: %s rows done" % idx)

dataLoadTime = time.time()
logging.debug("loaded %s records into memory" % idx)
logging.debug("time to load memory: %s " % (dataLoadTime - startTime))

# Reject all-zero columns unless a hyperprior (-H) was supplied.
for row in dataObj.U:
    if len(row) == 0 and not hasHyperprior:
        # TODO(max): write up a paper describing the hyperprior and link it.
        raise Exception(
            "You can't have any columns with all 0s, unless you provide a hyperprior (-H)"
        )

initPriorWeight = 1
# Normalize the seeded priors; the small nudges keep every entry nonzero.
priorSum = sum(priors) + 0.01  # Nudge to prevent zero
for i in range(0, K):
    priors[i] /= priorSum
    priors[i] += 0.01  # Nudge to prevent zero

# Optimize the Dirichlet priors against the compressed data.
priors = DME.findDirichletPriors(dataObj, priors, iterations, Beta, W)

print "Final priors: ", priors
logging.debug("Final average loss: %s" %
              DME.getTotalLoss(priors, dataObj, Beta, W))

totalTime = time.time() - dataLoadTime
logging.debug("Time to calculate: %s" % totalTime)
Пример #6
0
		
		sumData = sum(data)
		weightForMean = 1.0 / (1.0 + sumData)
		for i in range(0, K): priors[i] += data[i] * weightForMean
		dataObj.appendRow(data, 1)

	if (idx % 1000000) == 0: logging.debug("Loading Data: %s rows done" % idx)

dataLoadTime = time.time()
logging.debug("loaded %s records into memory" % idx)
logging.debug("time to load memory: %s " % (dataLoadTime - startTime))

for row in dataObj.U:
	if len(row) == 0 and not hasHyperprior:
		# TODO(max): write up a paper describing the hyperprior and link it.
		raise Exception("You can't have any columns with all 0s, unless you provide a hyperprior (-H)")

initPriorWeight = 1
priorSum = sum(priors) + 0.01 # Nudge to prevent zero
for i in range(0, K):
  priors[i] /= priorSum
  priors[i] += 0.01 # Nudge to prevent zero

priors = DME.findDirichletPriors(dataObj, priors, iterations, Beta, W)	

print "Final priors: ", priors
logging.debug("Final average loss: %s" % DME.getTotalLoss(priors, dataObj, Beta, W))

totalTime = time.time() - dataLoadTime
logging.debug("Time to calculate: %s" % totalTime)
Пример #7
0
# Benchmark: for each candidate alpha vector and sample size N, repeatedly
# generate synthetic data, recover the Dirichlet priors by MLE, and report
# error quantiles across 1000 trials.
for alphas in alphasList:
    for N in [10, 100, 1000, 10000, 100000, 1000000, 10000000]:
        print
        print "****************************************"
        print "alphas = ", alphas
        print
        K = len(alphas)

        for M in [5]:
            errors = []

            for i in range(0, 1000):
                uMatrix = Sample.generateRandomDataset(M, N, alphas)
                vVector = [N] * M
                # Start the optimizer from a uniform prior.
                init = [1.0 / K] * K
                MLEPriors = DME.findDirichletPriors(uMatrix, vVector, init,
                                                    False)
                errors.append(getError(alphas, MLEPriors))

            # Sort so fixed indices of the 1000 trials give the
            # 30th/50th/70th/90th percentile errors.
            errors.sort()

            print "\t".join(
                map(str,
                    [N, M, errors[300], errors[500], errors[700], errors[900]
                     ]))

        # Test the M = infinity case
        errors = []

        for i in range(0, 1000):
            ss = Sample.generateRandomDirichletsSS(N, alphas)
            init = [1.0 / K] * K
Пример #8
0
def main(K, iterations, H, input_stream, sampleRate, M):
	"""Estimate Dirichlet priors for K-category count data.

	Reads tab-separated count rows from input_stream, optionally
	subsamples them, caps each row's total count at M, seeds an initial
	prior from the data, and fits the priors with DME.findDirichletPriors.

	Args:
		K: number of categories (columns) per row.
		iterations: max optimization iterations for findDirichletPriors.
		H: hyperprior spec "b1,...,bK,W" (K+1 comma-separated floats);
			any other value disables the hyperprior (Beta = W = None).
		input_stream: file-like object of tab-separated integer counts.
		sampleRate: probability of keeping each input row.
		M: maximum total count per row; larger rows are downsampled.

	Returns:
		The fitted prior vector (list of K floats).

	Raises:
		Exception: if some category never occurs in the sampled data and
			no hyperprior was provided.
	"""
	startTime = time.time()
	logging.debug("K = " + str(K))
	logging.debug("iterations = " + str(iterations))
	logging.debug("H = " + str(H))
	logging.debug("sampleRate = " + str(sampleRate))
	logging.debug("M = " + str(M))

	# TODO(max): write up a paper describing the hyperprior and link it.
	W = 0
	Beta = [0] * K
	Hstr = H.split(",")
	hasHyperprior = False
	if len(Hstr) == K + 1:
		# H holds K beta values followed by the hyperprior weight W.
		for i in range(0, K):
			Beta[i] = float(Hstr[i])
		W = float(Hstr[K])
		hasHyperprior = True
	else:
		Beta = None
		W = None

	logging.debug("Beta = " + str(Beta))
	logging.debug("W = " + str(W))

	#####
	# Load Data
	#####
	csv.field_size_limit(1000000000)
	reader = csv.reader(input_stream, delimiter='\t')
	logging.debug("Loading data")
	priors = [0.] * K

	dataObj = DME.CompressedRowData(K)

	idx = 0
	for row in reader:
		idx += 1

		if random.random() < float(sampleRate):
			data = map(int, row)
			if len(data) != K:
				# Malformed row: log and continue. (Bug fix: this used to
				# log `i`, which is undefined on the first row and
				# otherwise a stale inner-loop index; `idx` is the row
				# counter.)
				logging.error("There are %s categories, but line has %s counts." % (K, len(data)))
				logging.error("line %s: %s" % (idx, data))

			# Downsample the row until its total count is at most M.
			while sum(data) > M:
				data[Sample.drawCategory(data)] -= 1

			# Seed the prior with the row's category mix, down-weighted by
			# the row total so large rows don't dominate.
			sumData = sum(data)
			weightForMean = 1.0 / (1.0 + sumData)
			for i in range(0, K):
				priors[i] += data[i] * weightForMean
			dataObj.appendRow(data, 1)

		# Progress logging every 1M input rows.
		if (idx % 1000000) == 0:
			logging.debug("Loading Data: %s rows done" % idx)

	dataLoadTime = time.time()
	logging.debug("loaded %s records into memory" % idx)
	logging.debug("time to load memory: %s " % (dataLoadTime - startTime))

	# Reject all-zero columns unless a hyperprior was supplied.
	for row in dataObj.U:
		if len(row) == 0 and not hasHyperprior:
			# TODO(max): write up a paper describing the hyperprior and link it.
			raise Exception("You can't have any columns with all 0s, unless you provide a hyperprior (-H)")

	# Normalize the seeded priors; the small nudges keep every entry nonzero.
	priorSum = sum(priors) + 0.01  # Nudge to prevent zero
	for i in range(0, K):
		priors[i] /= priorSum
		priors[i] += 0.01  # Nudge to prevent zero

	priors = DME.findDirichletPriors(dataObj, priors, iterations, Beta, W)

	logging.debug("Final average loss: %s" % DME.getTotalLoss(priors, dataObj, Beta, W))

	totalTime = time.time() - dataLoadTime
	logging.debug("Time to calculate: %s" % totalTime)
	return priors
Пример #9
0
		# Row total; rows with larger totals get proportionally less weight
		# when seeding the priors (weightForMean = 1 / (1 + total)).
		sumData = sum(data)
		weightForMean = 1.0 / (1.0 + sumData)
		for i in range(0, K): priors[i] += data[i] * weightForMean
		dataObj.appendRow(data, 1)

	# Progress logging every 1M input rows.
	if (idx % 1000000) == 0: logging.debug("Loading Data: %s rows done" % idx)

dataLoadTime = time.time()
logging.debug("loaded %s records into memory" % idx)
logging.debug("time to load memory: %s " % (dataLoadTime - startTime))

# TODO(max): Figure out what to do with the all-zero column
#for row in dataObj.U:
#  if len(row) == 0 and not hasHyperprior:
#    # TODO(max): write up a paper describing the hyperprior and link it.
#    raise Exception("You can't have any columns with all 0s, unless you provide a hyperprior (-H)")

initPriorWeight = 1
# Normalize the seeded priors; the small nudges keep every entry nonzero.
priorSum = sum(priors) + 0.01 # Nudge to prevent zero
for i in range(0, K):
  priors[i] /= priorSum
  priors[i] += 0.01 # Nudge to prevent zero

# Optimize the Dirichlet priors; this variant passes a single `hyperprior`
# object instead of separate Beta/W arguments.
priors = DME.findDirichletPriors(dataObj, priors, iterations, hyperprior)	

print "Final priors: ", priors
logging.debug("Final average loss: %s" % DME.getTotalLoss(priors, dataObj, hyperprior))

totalTime = time.time() - dataLoadTime
logging.debug("Time to calculate: %s" % totalTime)
Пример #10
0
        # Accumulate each category's count, down-weighted by the row total
        # (weightForMean is computed per row, above this fragment), to seed
        # the initial prior estimate.
        for i in range(0, K):
            priors[i] += data[i] * weightForMean
        dataObj.appendRow(data, 1)

    # Progress logging every 1M input rows.
    if (idx % 1000000) == 0: logging.debug("Loading Data: %s rows done" % idx)

dataLoadTime = time.time()
logging.debug("loaded %s records into memory" % idx)
logging.debug("time to load memory: %s " % (dataLoadTime - startTime))

# TODO(max): Figure out what to do with the all-zero column
#for row in dataObj.U:
#  if len(row) == 0 and not hasHyperprior:
#    # TODO(max): write up a paper describing the hyperprior and link it.
#    raise Exception("You can't have any columns with all 0s, unless you provide a hyperprior (-H)")

initPriorWeight = 1
# Normalize the seeded priors; the small nudges keep every entry nonzero.
priorSum = sum(priors) + 0.01  # Nudge to prevent zero
for i in range(0, K):
    priors[i] /= priorSum
    priors[i] += 0.01  # Nudge to prevent zero

# Optimize the Dirichlet priors; this variant passes a single `hyperprior`
# object instead of separate Beta/W arguments.
priors = DME.findDirichletPriors(dataObj, priors, iterations, hyperprior)

print "Final priors: ", priors
logging.debug("Final average loss: %s" %
              DME.getTotalLoss(priors, dataObj, hyperprior))

totalTime = time.time() - dataLoadTime
logging.debug("Time to calculate: %s" % totalTime)
Пример #11
0
		weightForMean = 1.0 / (1.0 + sumData)
		for i in range(0, K): 
			priors[i] += data[i] * weightForMean
			# uVector[j] counts rows whose category-i count exceeds j.
			uVector = uMatrix[i]
			for j in range(0, data[i]):
				if (len(uVector) == j): uVector.append(0)
				uVector[j] += 1
			
		# vVector[j] counts rows whose total count exceeds j.
		for j in range(0, sumData):
			if (len(vVector) == j): vVector.append(0)
			vVector[j] += 1

	# NOTE(review): `i` here is the leftover category index from the inner
	# loop above, not a row counter — this progress message looks broken;
	# sibling examples use a dedicated `idx` row counter. TODO confirm.
	if (i % 1000000) == 0: print "Loading Data", i

dataLoadTime = time.time()
print "all data loaded into memory"
print "time to load memory: ", dataLoadTime - startTime

initPriorWeight = 1
# Normalize the seeded priors into a starting point for optimization.
priorSum = sum(priors)
for i in range(0, K): priors[i] /= priorSum

verbose = options.V == "True"
# Optimize the Dirichlet priors from the count-exceedance statistics.
priors = DME.findDirichletPriors(uMatrix, vVector, priors, verbose)	
print "Final priors: ", priors
print "Final average loss:", DME.getTotalLoss(priors, uMatrix, vVector)

totalTime = time.time() - dataLoadTime
print "Time to calculate: " + str(totalTime)
	
	
Пример #12
0
# Benchmark: for each candidate alpha vector and sample size N, repeatedly
# generate synthetic data, recover the Dirichlet priors by MLE, and report
# error quantiles across 1000 trials.
for alphas in alphasList:
  for N in [10, 100, 1000, 10000, 100000, 1000000, 10000000]:
    print
    print "****************************************"
    print "alphas = ", alphas
    print
    K = len(alphas)
  
    for M in [5]:
      errors = []

      for i in range(0, 1000):
        uMatrix = Sample.generateRandomDataset(M, N, alphas)
        vVector = [N]*M
        # Start the optimizer from a uniform prior.
        init = [1.0 / K]*K
        MLEPriors = DME.findDirichletPriors(uMatrix, vVector, init, False)
        errors.append(getError(alphas, MLEPriors))

      # Sort so fixed indices of the 1000 trials give the
      # 30th/50th/70th/90th percentile errors.
      errors.sort()

      print "\t".join(map(str, [N, M, errors[300], errors[500], errors[700], errors[900]]))

    # Test the M = infinity case
    errors = []

    for i in range(0, 1000):
      ss = Sample.generateRandomDirichletsSS(N, alphas)
      init = [1.0 / K]*K
      # Sufficient-statistics variant (DE) used for the unbounded-M case.
      MLEPriors = DE.findDirichletPriors(ss, init, False)
      error = getError(alphas, MLEPriors)
      errors.append(error)