def decideOnLabel(word, context, vectors, clusterCenters, expansionParam, rel, partVoc, jointVocCache,
                  pathToExpansionCache, expansionCacheInfo, disambiguatedWords, labels):
    """Return the sense vectors of *word* and one probability per sense label.

    The context is expanded with ``expandAndCleanContext`` (backed by a
    per-word shelve cache), restricted to words of the joint vocabulary that
    are not listed in ``disambiguatedWords``, and summarised as a histogram of
    similarities to ``clusterCenters``.  Every sense vector
    ``vectors[word + "_" + label]`` gets a histogram of its own; the cosine
    similarity between the two histograms, shifted by +1, is normalised over
    all labels.

    Args:
        word: the word to disambiguate.
        context: context passed on to ``expandAndCleanContext``.
        vectors: mapping from word (and ``word_label``) to vector.
        clusterCenters: sequence of cluster-center vectors.
        expansionParam: passed through to ``expandAndCleanContext``.
        rel: mapping; ``rel[word].keys()`` supplies the related words.
        partVoc: set intersected with the related words.
        jointVocCache: dict caching the joint vocabulary per word (mutated).
        pathToExpansionCache, expansionCacheInfo: prefix and suffix of the
            shelve file name; ``word`` is placed between them.
        disambiguatedWords: words to drop from the expanded context.
        labels: sense labels of ``word``.

    Returns:
        ``(wordvectors, probabilities)``, both ordered like ``labels``.
    """
    def getHistogram2(words, clusterCenters):
        # accumulate, per cluster center, the similarity of every given vector
        histogram = [0] * len(clusterCenters)
        for vector in words:
            sims = [cosine_similarity(vector, center) for center in clusterCenters]
            for i, sim in enumerate(sims):
                histogram[i] += sim
        # normalise so the entries sum to 1 (only when the total is positive)
        total = float(sum(histogram))
        if total > 0:
            histogram = [value / total for value in histogram]
        return histogram

    # get the jointVocabulary (computed once per word, then cached)
    if word not in jointVocCache:
        jointVocabulary = partVoc.intersection(set(rel[word].keys()))
        jointVocCache[word] = jointVocabulary
    else:
        jointVocabulary = jointVocCache[word]

    # open the expansion cache and make sure it is closed again even when
    # the expansion raises
    expansionCache = shelve.open(pathToExpansionCache + word + expansionCacheInfo)
    try:
        expandedContext = expandAndCleanContext(context, word, rel, expansionParam,
                                                jointVocabulary, expansionCache)
    finally:
        expansionCache.close()

    # fresh similarity cache for this call
    indexCache = dict()
    expandedContext = [x for x in expandedContext
                       if x not in disambiguatedWords and x in jointVocabulary]

    # histogram for the context
    histogram = getHistogram(expandedContext, clusterCenters, vectors, indexCache)

    # histograms for the different sense vectors of the word
    wordvectors = [vectors[word + "_" + label] for label in labels]
    histograms = [getHistogram2([vec], clusterCenters) for vec in wordvectors]
    sims = [cosine_similarity(hist, histogram) for hist in histograms]

    # shift by +1 (presumably to keep the scores non-negative) and normalise;
    # the total is computed once instead of once per label
    simsCorrected = [sim + 1 for sim in sims]
    total = float(sum(simsCorrected))
    probabilities = [x / total for x in simsCorrected]
    return wordvectors, probabilities
def getHistogram(words, clusterCenters, vectors, indexCache):
    """Build a normalised histogram of similarities to the cluster centers.

    For every word the cosine similarity of its vector to each cluster center
    is added to the matching histogram slot.  Per-word similarity lists are
    memoised in ``indexCache``.

    Args:
        words: iterable of words; each uncached word must be a key of ``vectors``.
        clusterCenters: sequence of cluster-center vectors.
        vectors: mapping from word to vector.
        indexCache: dict mapping word to its list of similarities (mutated).

    Returns:
        A list with one entry per cluster center.  The entries are divided by
        their sum when that sum is positive; otherwise the raw sums are
        returned unchanged.
    """
    histogram = [0] * len(clusterCenters)
    for word in words:
        if word in indexCache:
            # already seen: reuse the cached similarities
            sims = indexCache[word]
        else:
            vector = vectors[word]
            sims = [cosine_similarity(vector, center) for center in clusterCenters]
            indexCache[word] = sims
        # add this word's similarity to every cluster slot
        for i, sim in enumerate(sims):
            histogram[i] += sim
    total = float(sum(histogram))
    if total > 0:
        histogram = [value / total for value in histogram]
    return histogram
def vector_similarity(cs, w1, w2, vectors):
    """Return the similarity of two words, falling back to ``cs``.

    Args:
        cs: value returned when either word has no entry in ``vectors``.
        w1, w2: the two words to compare.
        vectors: mapping from word to vector.

    Returns:
        ``1.0`` when both words are equal, the cosine similarity of their
        vectors when both are in ``vectors``, and ``cs`` otherwise.
    """
    # compare against the parameter w2 (the original referenced the
    # undefined name ``word2``)
    if w1 == w2:
        return 1.0
    if w1 in vectors and w2 in vectors:
        return cosine_similarity(vectors[w1], vectors[w2])
    return cs
def getLabel2(wordVector, expandedContext, vectors):
    """Find the context word whose vector is most similar to ``wordVector``.

    Args:
        wordVector: vector to compare against.
        expandedContext: iterable of candidate words; words without an entry
            in ``vectors`` are ignored.
        vectors: mapping from word to vector.

    Returns:
        ``(bestWord, bestSim)``; both are ``None`` when no candidate has a
        vector.  On ties the first candidate wins.
    """
    bestSim = None
    bestWord = None
    for candidate in expandedContext:
        if candidate not in vectors:
            continue
        sim = cosine_similarity(vectors[candidate], wordVector)
        # explicit None check instead of relying on Python 2 ordering None
        # below every number
        if bestSim is None or sim > bestSim:
            bestSim = sim
            bestWord = candidate
    return bestWord, bestSim
def avgSimC(probs1, vecs1, probs2, vecs2):
    """Average the probability-weighted cosine similarity over all sense pairs.

    Computes ``sum_i sum_j probs1[i] * probs2[j] * cos(vecs1[i], vecs2[j])``
    divided by ``len(probs1) * len(probs2)``.

    Args:
        probs1, vecs1: probabilities and vectors of the first word's senses.
        probs2, vecs2: probabilities and vectors of the second word's senses.

    Returns:
        The averaged similarity, or ``0.0`` when either probability list is
        empty (the original raised ``ZeroDivisionError`` in that case).
    """
    # a length mismatch is only reported, as before; the computation goes on
    if len(probs1) != len(vecs1) or len(probs2) != len(vecs2):
        print("There is a serious problem!")
    pairCount = len(probs1) * len(probs2)
    if pairCount == 0:
        # no sense pairs to average over
        return 0.0
    summationResult = 0
    for i, p1 in enumerate(probs1):
        for j, p2 in enumerate(probs2):
            summationResult += p1 * p2 * cosine_similarity(vecs1[i], vecs2[j])
    return summationResult / float(pairCount)
def getLabel2(wordVector, expandedContext, vectors):
    """Return the context word whose vector is most similar to ``wordVector``.

    Args:
        wordVector: vector to compare against.
        expandedContext: iterable of candidate words; words without an entry
            in ``vectors`` are skipped.
        vectors: mapping from word to vector.

    Returns:
        The best-matching word, or ``None`` when no candidate has a vector.
        On ties the first candidate wins.
    """
    bestSim = None
    bestWord = None
    for candidate in expandedContext:
        if candidate not in vectors:
            continue
        sim = cosine_similarity(vectors[candidate], wordVector)
        # explicit None check instead of relying on Python 2 ordering None
        # below every number
        if bestSim is None or sim > bestSim:
            bestSim = sim
            bestWord = candidate
    return bestWord
def context_similarity(c1, c2, clusters):
    """Compare two contexts through their overlap with every cluster.

    Each context is turned into a vector holding, per cluster, the sum of
    ``cluster[key] * context[key]`` over the keys the two share.  That vector
    is passed through ``normalizeVec`` unless its entries sum to 0.  The
    result is the cosine similarity of the two vectors.
    """
    def _overlap(cluster, context):
        # sparse dot product over the keys present in both mappings
        return sum(cluster[key] * context[key] for key in context if key in cluster)

    def _profile(context):
        weights = [_overlap(cluster, context) for cluster in clusters]
        if sum(weights) == 0:
            return weights
        return normalizeVec(weights)

    first = _profile(c1)
    second = _profile(c2)
    return cosine_similarity(first, second)
def getLabel3(wordRel, wordVector, expandedContext, vectors, jointVocabulary):
    """Pick the context word with the best combined relatedness/similarity.

    The score of a candidate is the mean of ``wordRel[candidate]`` and the
    cosine similarity between ``vectors[candidate]`` and ``wordVector``.

    Args:
        wordRel: mapping from candidate word to its relatedness score.
        wordVector: vector to compare against.
        expandedContext: iterable of candidate words; only words contained
            in ``jointVocabulary`` are considered.
        vectors: mapping from word to vector.
        jointVocabulary: container of admissible candidate words.

    Returns:
        ``(bestWord, bestScore)``; both are ``None`` when no candidate is
        admissible.  On ties the first candidate wins.
    """
    bestWord = None
    bestScore = None
    for candidate in expandedContext:
        if candidate not in jointVocabulary:
            continue
        relScore = wordRel[candidate]
        sim = cosine_similarity(vectors[candidate], wordVector)
        score = (relScore + sim) / float(2)
        # explicit None check instead of relying on Python 2 ordering None
        # below every number
        if bestScore is None or score > bestScore:
            bestScore = score
            bestWord = candidate
    return bestWord, bestScore
def dataCompression(data, vectors):
    """Greedily merge rarely used labels into similar, frequently used ones.

    Labels are ordered by the number of entries stored under them.  In each
    round the most similar (candidate, substitute) pair is searched, taking
    the candidate from the less populated half and the substitute from the
    more populated half.  If its cosine similarity is at least 0.5 the
    candidate's entries are appended to the substitute's and the candidate
    is removed.  The process stops when no pair reaches 0.5 or no pair can
    be compared at all.

    Args:
        data: dict mapping label to a list of entries; modified in place.
        vectors: mapping from label to vector; labels without a vector are
            never merged.

    Returns:
        The same ``data`` object after merging.
    """
    def getSortedKeys(data):
        # labels ordered by descending entry count (stable for equal counts)
        return sorted(data, key=lambda key: len(data[key]), reverse=True)

    while True:
        bestSim = None
        bestCandidate = None
        bestSubstitute = None
        keys = getSortedKeys(data)
        # floor division keeps the slice index an int on Python 3 as well
        half = len(keys) // 2
        # substitutes come from the upper half; filter them once per round
        upper = [key for key in keys[:half] if key in vectors]
        for candidate in keys[half:]:
            if candidate not in vectors:
                continue
            for substitute in upper:
                sim = cosine_similarity(vectors[candidate], vectors[substitute])
                if bestSim is None or sim > bestSim:
                    bestSim = sim
                    bestCandidate = candidate
                    bestSubstitute = substitute
        # nothing comparable, or the best pair is not similar enough
        if bestSim is None or bestSim < 0.5:
            break
        print("Merging  %s  into  %s  with sim:  %s" % (bestCandidate, bestSubstitute, bestSim))
        data[bestSubstitute] += data[bestCandidate]
        del data[bestCandidate]

    print("Keeping labels: (label, context count)")
    for k in getSortedKeys(data):
        print("%s %s" % (k, len(data[k])))
    return data
def dataCompression(data, vectors):
    """Greedily merge rarely used labels into similar, frequently used ones.

    Labels are ordered by the number of entries stored under them.  In each
    round the most similar (candidate, substitute) pair is searched, taking
    the candidate from the less populated half and the substitute from the
    more populated half.  If its cosine similarity is at least 0.5 the
    candidate's entries are appended to the substitute's and the candidate
    is removed.  The process stops when no pair reaches 0.5 or no pair can
    be compared at all.

    Args:
        data: dict mapping label to a list of entries; modified in place.
        vectors: mapping from label to vector; labels without a vector are
            never merged.

    Returns:
        The same ``data`` object after merging.
    """
    def getSortedKeys(data):
        # labels ordered by descending entry count (stable for equal counts)
        return sorted(data, key=lambda key: len(data[key]), reverse=True)

    while True:
        bestSim = None
        bestCandidate = None
        bestSubstitute = None
        keys = getSortedKeys(data)
        # floor division keeps the slice index an int on Python 3 as well
        half = len(keys) // 2
        # substitutes come from the upper half; filter them once per round
        upper = [key for key in keys[:half] if key in vectors]
        for candidate in keys[half:]:
            if candidate not in vectors:
                continue
            for substitute in upper:
                sim = cosine_similarity(vectors[candidate], vectors[substitute])
                if bestSim is None or sim > bestSim:
                    bestSim = sim
                    bestCandidate = candidate
                    bestSubstitute = substitute
        # nothing comparable, or the best pair is not similar enough
        if bestSim is None or bestSim < 0.5:
            break
        print("Merging  %s  into  %s  with sim:  %s" % (bestCandidate, bestSubstitute, bestSim))
        data[bestSubstitute] += data[bestCandidate]
        del data[bestCandidate]

    print("Keeping labels: (label, context count)")
    for k in getSortedKeys(data):
        print("%s %s" % (k, len(data[k])))
    return data
def getHistogram2(words, clusterCenters):
    """Build a normalised histogram of similarities to the cluster centers.

    Despite its name, ``words`` is an iterable of vectors: the cosine
    similarity of each vector to every cluster center is added to the
    matching histogram slot.

    Args:
        words: iterable of vectors.
        clusterCenters: sequence of cluster-center vectors.

    Returns:
        A list with one entry per cluster center.  The entries are divided by
        their sum when that sum is positive; otherwise the raw sums are
        returned unchanged.
    """
    histogram = [0] * len(clusterCenters)
    for vector in words:
        sims = [cosine_similarity(vector, center) for center in clusterCenters]
        for i, sim in enumerate(sims):
            histogram[i] += sim
    total = float(sum(histogram))
    if total > 0:
        histogram = [value / total for value in histogram]
    return histogram
# Fragment of an evaluation loop (its header and the handling of word1 are not
# visible here): looks up a vector for word2, scores the pair as
# cosine_similarity(vec1, vec2) * vector_similarity(...)**3, and records the
# score next to the human rating; the tail prints the Spearman correlation and
# the coverage percentage, then closes `rel`.
# NOTE(review): decideOnLabel is called with 13 positional arguments and its
# result is concatenated to a string, while the decideOnLabel defined in this
# file takes 12 parameters and returns a (wordvectors, probabilities) tuple;
# presumably written against another version, confirm.
# NOTE(review): the original indentation was lost when this line was collapsed.
label = decideOnLabel(word2, context2, vectors, clusterCenters, expansion, rel, partVoc, jointVocCache, pathToExpansionCache, expansionCacheInfo, pathToSVMFile, svmFileInfo, disambiguatedWords) new = word2 + "_" + label if new in vectors: vec2 = vectors[word2 + "_" + label] w2 = True elif word2 in partVoc: vec2 = vectors[word2] w2 = True # only if both words have been found (somewhere), we continue if w1 and w2: s = cosine_similarity(vec1, vec2) v = vector_similarity(s, word1, word2, normalVectors) #print s, v score = s * v**3 methodsRating.append(score) humanRating.append(question['rating']) if len(methodsRating) % 100 == 0 and len(methodsRating) > 0: print i, spearman(methodsRating, humanRating) done += 1 # print the spearman correlation print spearman(methodsRating, humanRating) print "Coverage: ", done / float(len(questions)) * 100, "%" rel.close()
# Script fragment: closes `rel`, drops every label without an entry in
# `vectors`, then, while more than 5 labels remain, finds the most similar
# pair of labels and removes one of the two; stops early when the best pair
# similarity is below 0.5.
# NOTE(review): `sim > bestSim` and `bestSim < 0.5` compare against None,
# which only works on Python 2.
# NOTE(review): the label to keep is chosen via getLabel2(vectors['bat'], ...);
# the hard-coded 'bat' is presumably left over from a single-word experiment,
# confirm.
# NOTE(review): getLabel2 is defined twice in this file; `keeper == best1`
# can only be true with the version that returns the word alone, not with
# the one that returns a (word, sim) tuple.
# NOTE(review): the original indentation was lost when this line was collapsed.
rel.close() toBeShifted = [] for label in labels: if not label in vectors: toBeShifted.append(label) labels = list(set(labels) - set(toBeShifted)) while len(labels) > 5: best1 = None best2 = None bestSim = None for label1 in labels: for label2 in labels: if label1 != label2: sim = cosine_similarity(vectors[label1], vectors[label2]) if sim > bestSim: bestSim = sim best1 = label1 best2 = label2 if bestSim < 0.5: break keeper = getLabel2(vectors['bat'], [best1, best2], vectors) if keeper == best1: labels.remove(best2) else: labels.remove(best1) print bestSim, " Merging ", best1, " and " , best2, " into ", keeper print labels
def vector_similarity(cs, w1, w2, vectors):
    """Cosine similarity of two words' vectors, or ``cs`` as a fallback.

    ``cs`` is returned unchanged whenever ``w1`` or ``w2`` has no entry in
    ``vectors``.
    """
    both_known = w1 in vectors and w2 in vectors
    if not both_known:
        return cs
    return cosine_similarity(vectors[w1], vectors[w2])
# Fragment of an evaluation loop (its header and the handling of word1 are not
# visible here): for a word2 listed in disambiguatedWords it looks up the
# vector stored under word2 + "_" + label, otherwise the plain vector when
# word2 is in partVoc; the pair is scored as
# cosine_similarity(vec1, vec2) * vector_similarity(...)**3 and recorded next
# to the human rating.  The tail prints the Spearman correlation and the
# coverage percentage, then closes `rel`.
# NOTE(review): decideOnLabel is called with 13 positional arguments and its
# result is concatenated to a string, while the decideOnLabel defined in this
# file takes 12 parameters and returns a (wordvectors, probabilities) tuple;
# presumably written against another version, confirm.
# NOTE(review): the original indentation was lost when this line was collapsed.
# if word2 has been disambiguated or is in vectors set finder to true if word2 in disambiguatedWords: label = decideOnLabel(word2, context2, vectors, clusterCenters, expansion, rel, partVoc, jointVocCache, pathToExpansionCache, expansionCacheInfo, pathToSVMFile, svmFileInfo, disambiguatedWords) new = word2 + "_" + label if new in vectors: vec2 = vectors[word2 + "_" + label] w2 = True elif word2 in partVoc: vec2 = vectors[word2] w2 = True # only if both words have been found (somewhere), we continue if w1 and w2: s = cosine_similarity(vec1, vec2) v = vector_similarity(s, word1, word2, normalVectors) #print s, v score = s * v**3 methodsRating.append(score) humanRating.append(question['rating']) if len(methodsRating) % 100 == 0 and len(methodsRating) > 0: print i, spearman(methodsRating, humanRating) done += 1 # print the spearman correlation print spearman(methodsRating, humanRating) print "Coverage: ", done / float(len(questions)) * 100, "%" rel.close()
# Fragment of an evaluation loop (its header and the handling of word1 are not
# visible here): for a disambiguated word2 the sense vector
# vectors[word2 + "_" + label] and normalVectors[word2] are combined through
# getAverageWordRep2; otherwise the plain vector is used when word2 is in
# partVoc.  The pair is scored as cosine_similarity(vec1, vec2) and recorded
# next to the human rating.  The tail prints the Spearman correlation and the
# coverage percentage, then closes `rel`.
# NOTE(review): decideOnLabel is called with 13 positional arguments and its
# result is concatenated to a string, while the decideOnLabel defined in this
# file takes 12 parameters and returns a (wordvectors, probabilities) tuple;
# presumably written against another version, confirm.
# NOTE(review): the original indentation was lost when this line was collapsed.
# if word2 has been disambiguated or is in vectors set finder to true if word2 in disambiguatedWords: label = decideOnLabel(word2, context2, vectors, clusterCenters, expansion, rel, partVoc, jointVocCache, pathToExpansionCache, expansionCacheInfo, pathToSVMFile, svmFileInfo, disambiguatedWords) new = word2 + "_" + label if new in vectors: vec21 = vectors[word2 + "_" + label] vec22 = normalVectors[word2] vec2 = getAverageWordRep2([vec21, vec22]) w2 = True elif word2 in partVoc: vec2 = vectors[word2] w2 = True # only if both words have been found (somewhere), we continue if w1 and w2: methodsRating.append(cosine_similarity(vec1, vec2)) humanRating.append(question['rating']) if len(methodsRating) % 100 == 0 and len(methodsRating) > 0: print i, spearman(methodsRating, humanRating) done += 1 # print the spearman correlation print spearman(methodsRating, humanRating) print "Coverage: ", done / float(len(questions)) * 100, "%" rel.close()
# Fragment of an evaluation loop (its header and the handling of word1 are not
# visible here): looks up a vector for word2 (label2 is set to
# "not ambiguous" when the plain vector is used), scores the pair as
# cosine_similarity(vec1, vec2), and additionally computes a baseline from
# normalVectors; score, human rating and baseline are appended to
# methodsRating, humanRating and otherRating.
# NOTE(review): decideOnLabel is called with 13 positional arguments and its
# result is concatenated to a string, while the decideOnLabel defined in this
# file takes 12 parameters and returns a (wordvectors, probabilities) tuple;
# presumably written against another version, confirm.
# NOTE(review): the original indentation was lost when this line was collapsed.
label2 = decideOnLabel(word2, context2, vectors, clusterCenters, expansion, rel, partVoc, jointVocCache, pathToExpansionCache, expansionCacheInfo, pathToSVMFile, svmFileInfo, disambiguatedWords) new = word2 + "_" + label2 if new in vectors: vec2 = vectors[word2 + "_" + label2] w2 = True elif word2 in partVoc: label2 = "not ambiguous" vec2 = vectors[word2] w2 = True # only if both words have been found (somewhere), we continue if w1 and w2: score = cosine_similarity(vec1, vec2) base = cosine_similarity(normalVectors[word1], normalVectors[word2]) # if abs(base - score) > 0.3: # print question['word1'], label1 # print question['context1'] # print question['word2'], label2 # print question['context2'] # print i, "\tHuman average: ", question['rating'] # print i, "\tPALM score", score # print i, "\tBaseline: ", base # print i, "\tDifferent between base and palm: ", abs(base - score) methodsRating.append(score) humanRating.append(question['rating']) otherRating.append(base)
# Fragment of an evaluation loop; it starts inside an if/elif chain whose
# opening `if` is not visible here.  When both words were found it prints the
# two words, their labels and contexts, the cosine score, the human rating,
# the normalVectors baseline and the absolute difference between baseline
# and score, then records score and human rating; the tail prints the
# Spearman correlation.
# NOTE(review): the original indentation was lost when this line was collapsed.
vec2 = vectors[word2 + "_" + label2] w2 = True elif word2 in partVoc: label2 = "not ambiguous" vec2 = vectors[word2] w2 = True # only if both words have been found (somewhere), we continue if w1 and w2: print question['word1'], label1 print question['context1'] print question['word2'], label2 print question['context2'] score = cosine_similarity(vec1, vec2) base = cosine_similarity(normalVectors[word1], normalVectors[word2]) print "\tPALM score", score print "\tHuman average: ", question['rating'] print "\tBaseline: ", base print "\tDifferent between base and palm: ", abs(base - score) print print methodsRating.append(score) humanRating.append(question['rating']) if len(methodsRating) % 100 == 0 and len(methodsRating) > 0: print i, spearman(methodsRating, humanRating) done += 1 # print the spearman correlation print spearman(methodsRating, humanRating)
def decideOnLabel(word, context, vectors, clusterCenters, expansionParam, rel, partVoc, jointVocCache,
                  pathToExpansionCache, expansionCacheInfo, disambiguatedWords, labels):
    """Return the sense vectors of *word* and one probability per sense label.

    The context is expanded with ``expandAndCleanContext`` (backed by a
    per-word shelve cache), restricted to words of the joint vocabulary that
    are not listed in ``disambiguatedWords``, and summarised as a histogram of
    similarities to ``clusterCenters``.  Every sense vector
    ``vectors[word + "_" + label]`` gets a histogram of its own; the cosine
    similarity between the two histograms, shifted by +1, is normalised over
    all labels.

    Args:
        word: the word to disambiguate.
        context: context passed on to ``expandAndCleanContext``.
        vectors: mapping from word (and ``word_label``) to vector.
        clusterCenters: sequence of cluster-center vectors.
        expansionParam: passed through to ``expandAndCleanContext``.
        rel: mapping; ``rel[word].keys()`` supplies the related words.
        partVoc: set intersected with the related words.
        jointVocCache: dict caching the joint vocabulary per word (mutated).
        pathToExpansionCache, expansionCacheInfo: prefix and suffix of the
            shelve file name; ``word`` is placed between them.
        disambiguatedWords: words to drop from the expanded context.
        labels: sense labels of ``word``.

    Returns:
        ``(wordvectors, probabilities)``, both ordered like ``labels``.
    """
    def getHistogram2(words, clusterCenters):
        # accumulate, per cluster center, the similarity of every given vector
        histogram = [0] * len(clusterCenters)
        for vector in words:
            sims = [cosine_similarity(vector, center) for center in clusterCenters]
            for i, sim in enumerate(sims):
                histogram[i] += sim
        # normalise so the entries sum to 1 (only when the total is positive)
        total = float(sum(histogram))
        if total > 0:
            histogram = [value / total for value in histogram]
        return histogram

    # get the jointVocabulary (computed once per word, then cached)
    if word not in jointVocCache:
        jointVocabulary = partVoc.intersection(set(rel[word].keys()))
        jointVocCache[word] = jointVocabulary
    else:
        jointVocabulary = jointVocCache[word]

    # open the expansion cache and make sure it is closed again even when
    # the expansion raises
    expansionCache = shelve.open(pathToExpansionCache + word + expansionCacheInfo)
    try:
        expandedContext = expandAndCleanContext(context, word, rel, expansionParam,
                                                jointVocabulary, expansionCache)
    finally:
        expansionCache.close()

    # fresh similarity cache for this call
    indexCache = dict()
    expandedContext = [x for x in expandedContext
                       if x not in disambiguatedWords and x in jointVocabulary]

    # histogram for the context
    histogram = getHistogram(expandedContext, clusterCenters, vectors, indexCache)

    # histograms for the different sense vectors of the word
    wordvectors = [vectors[word + "_" + label] for label in labels]
    histograms = [getHistogram2([vec], clusterCenters) for vec in wordvectors]
    sims = [cosine_similarity(hist, histogram) for hist in histograms]

    # shift by +1 (presumably to keep the scores non-negative) and normalise;
    # the total is computed once instead of once per label
    simsCorrected = [sim + 1 for sim in sims]
    total = float(sum(simsCorrected))
    probabilities = [x / total for x in simsCorrected]
    return wordvectors, probabilities