def construct(filename): oldj = {} oj = {} newj = {} newe = {} newf = {} with open(filename, "r") as fh: c = 0 for line in fh: c += 1 if c > 300: break pcs = line.split() px = pcs[0].split("_") f = px[0] e = px[2] old = int(pcs[1]) new = int(pcs[2]) oldj[(e, f)] = old oj[pcs[0]] = old newj[pcs[0]] = new newe[e] = newe.get(e, 0) + new newf[f] = newf.get(f, 0) + new normalize(oldj) normalize(oj) normalize(newj) normalize(newe) normalize(newf) nonzeroentries = {} for e in newe: for f in newf: nonzeroentries[(e, f)] = 1 model = constructModel(oldj, nonzeroentries, newe, newf) # verbose output with timer("opt") as tim: model.params.outputflag = 1 model.optimize() model.printStats() model.printQuality() # show the final output print print "==== final variable values ====" osum = 0 nsum = 0 for var in model.getVars(): name = var.getAttr(grb.GRB.attr.VarName) val = var.getAttr(grb.GRB.attr.X) if not name.startswith("b"): o = oj.get(name, 1e-9) n = newj.get(name, 1e-9) val = max(val, 1e-9) # if val > 1e-6: print name, "\t", o, '\t', n, '\t', val osum += o * log(o / val) nsum += n * log(n / val) print osum print nsum
def cosSim(m1, m2, keep=10): with timer('computing cosine sim') as tim: E = dict() for r in m1: for (k,v) in r.iteritems(): E[k] = 1 print 'source term count : ', len(E) t = dict() for e in E: if len(t) % 100 == 0: print len(t) s = 0 rr = dict() for d in range(len(m1)): r1 = m1[d] r2 = m2[d] if e not in r1: continue se = r1[e] for (f, sf) in r2.iteritems(): rr[f] = rr.get(f, 0) + se * sf rr2 = dict() for (k, v) in sorted(rr.iteritems(), reverse=True, key=lambda p: p[1]): rr2[k] = v if len(rr2) >= keep: break t[e] = rr2 return t
def __init__(self, lines): with timer('loading phrases') as tim: count = 0 for line in lines: phrase = PhrasePair(line) key = ' '.join(phrase.source) l = self.d.get(key, []) l.append(phrase) self.d[key] = l count += 1 if 0 == count % 10000: print "{0} phrases \r".format(count), ; stdout.flush() if count == 100000: break self.count = count
def pruneRelativeEntropy(filename, outfile): pt = PhraseTable(fileLines(filename)) mapFn = lambda line: computeRelEnt(pt, line) with timer('pruning') as tim: with openMaybeGz(outfile, 'w') as o: count = 0 chunksize = 100 for line in threaded_map(mapFn, fileChunks(filename, chunksize), threadCount = 6, maxInputQ = 1024): o.write(line) count += chunksize if 0 == count % 500: (elapsed, remaining, totalTime) = tim.predict(count, pt.count) print "{0:.3f} elapsed; {1:.3f} remaining; {2:.3f} total; count = {3} \r".format(elapsed, remaining, totalTime, count), ; stdout.flush()
def normalizeMatrix(matrix):
    """Return a copy of `matrix` with every key scaled to unit Euclidean norm.

    `matrix` is a sequence of dict rows mapping key -> number.  For each key
    the norm is the square root of the sum of its squared values over all
    rows, and every value is divided by the norm of its key.

    A key whose values are all zero has norm 0; its entries are returned as
    0.0 instead of raising ZeroDivisionError.

    Returns a new list of new dicts; the input rows are not modified.
    """
    with timer('normalizing matrix'):
        # First pass: sum of squares per key.
        sumSquares = dict()
        for row in matrix:
            for key, value in row.iteritems():
                sumSquares[key] = sumSquares.get(key, 0) + value * value
        norms = dict((key, sqrt(total))
                     for key, total in sumSquares.iteritems())

        # Second pass: rescale each row into a fresh dict.
        normalized = []
        for row in matrix:
            scaled = dict()
            for key, value in row.iteritems():
                norm = norms[key]
                # norm == 0 implies every value for this key is 0.
                scaled[key] = value / norm if norm else 0.0
            normalized.append(scaled)
        return normalized
def countMatrixToBm25(matrix):
    """Replace every count in a list of term-count dicts by its bm25 weight.

    `matrix` is a sequence of dict rows mapping term -> count.  Document
    frequencies come from docFreqFromCountMatrix.  The document length
    handed to bm25 is the number of entries in the row (len(row)), and the
    average length is the mean of those sizes.

    Returns a new list of dicts with the same keys as the input rows.
    Raises ZeroDivisionError when `matrix` is empty, since the average
    length divides by the number of rows.
    """
    with timer('counts->bm25'):
        docFreqs = docFreqFromCountMatrix(matrix)
        numDocs = float(len(matrix))
        avgDocLen = sum(len(row) for row in matrix) / numDocs

        weighted = []
        for counts in matrix:
            docLen = len(counts)
            weighted.append(dict(
                (term, bm25(termFreq, docFreqs[term], docLen, avgDocLen, numDocs))
                for term, termFreq in counts.iteritems()))
        return weighted
def constructModel(oldJointData, nonzeroEntries, newEMarginal, newFMarginal, epsilon=1e-6):
    """Build the Gurobi model that fits a new joint table to given marginals.

    Parameters:
        oldJointData: dict mapping (e, f) -> old joint value.
        nonzeroEntries: iterable of the (e, f) pairs that get a variable.
        newEMarginal: dict mapping e -> target value of sum_f joint[e, f].
        newFMarginal: dict mapping f -> target value of sum_e joint[e, f].
        epsilon: slack allowed on each marginal constraint.

    Variables: one continuous variable in [0, 1] per (e, f) pair, named
    f + "__" + e.  Pairs present in oldJointData also get an auxiliary
    variable named "b" + str((e, f)).

    Objective terms:
        pair in oldJointData:  (v - old)^2 + b, where the constraints
                               below force b >= |v - old|
        any other pair:        1.1 * v + v^2

    Raises KeyError if a key of newEMarginal / newFMarginal has no pair in
    nonzeroEntries.  Returns the updated, not yet optimised, model.
    """
    with timer("constr"):
        model = grb.Model("model")

        # construct the variables and the objective
        e2f = {}     # e -> {f: var}
        f2e = {}     # f -> {e: var}
        absDev = {}  # var -> (auxiliary var, old value)
        obj = grb.QuadExpr()
        for e_f in nonzeroEntries:
            (e, f) = e_f
            v = model.addVar(0.0, 1.0, 0.0, grb.GRB.CONTINUOUS, f + "__" + e)
            e2f.setdefault(e, {})[f] = v
            f2e.setdefault(f, {})[e] = v
            if e_f in oldJointData:
                # objective contains (v - OLD)^2 plus the |v - OLD| bound b
                old = oldJointData[e_f]
                b = model.addVar(0.0, 1.0, 0.0, grb.GRB.CONTINUOUS, "b" + str(e_f))
                obj += (v - old) * (v - old)
                obj += b
                absDev[v] = (b, old)
            else:
                # no old value: objective contains 1.1 * v + v^2
                obj += 1.1 * v
                obj += v * v

        model.update()  # add the variables before setting the objective
        model.setObjective(obj)

        # Now create the constraints.  There are E and F marginal
        # constraints of the form:
        #    sum_f newJoint[e,f] = newEMarginal[e]   for all e
        #    sum_e newJoint[e,f] = newFMarginal[f]   for all f
        # Exact equality is too restrictive (maybe impossible), so each one
        # gets a little slack -- maybe this should be penalized in the
        # objective.  What is actually written is:
        #    sum_f newJoint[e,f] - newEMarginal[e] <=  epsilon   for all e
        #    sum_f newJoint[e,f] - newEMarginal[e] >= -epsilon   for all e
        # and similarly for newFMarginal.
        for e, marg in newEMarginal.iteritems():
            lhs = grb.LinExpr(-marg)
            for var in e2f[e].itervalues():
                lhs += var
            model.addConstr(lhs <= epsilon, "ce+" + e)
            model.addConstr(lhs >= -epsilon, "ce-" + e)

        for f, marg in newFMarginal.iteritems():
            lhs = grb.LinExpr(-marg)
            for var in f2e[f].itervalues():
                lhs += var
            # Named after f.  These used to be named with whatever e the
            # inner loop finished on, which was misleading and not unique.
            model.addConstr(lhs <= epsilon, "cf+" + f)
            model.addConstr(lhs >= -epsilon, "cf-" + f)

        # b >= v - old, b >= old - v and b >= 0 together give b >= |v - old|.
        counter = 0
        for v, (b, old) in absDev.iteritems():
            model.addConstr(b >= v - old, "vb+" + str(counter))
            model.addConstr(b >= -v + old, "vb-" + str(counter))
            model.addConstr(b >= 0, "vb0" + str(counter))
            counter += 1

        # finalize the model
        model.update()
        return model
def docTermCountMatrix(fileList, mincount = 0):
    """Collect the getUnigrams result of every file into a list.

    Parameters:
        fileList: iterable of file names, processed in order.
        mincount: passed unchanged to getUnigrams as its second argument.

    Returns a list with one getUnigrams(filename, mincount) result per
    file, in the order of `fileList`.
    """
    with timer('building termcount matrix'):
        return [getUnigrams(path, mincount) for path in fileList]