예제 #1
0
def decideOnLabel(word, context, vectors, clusterCenters, expansionParam, rel, partVoc, jointVocCache, pathToExpansionCache, expansionCacheInfo, disambiguatedWords, labels):
	"""Return the candidate sense vectors of `word` with a weight for each.

	A similarity histogram over `clusterCenters` is built for the expanded
	context and for each `vectors[word + "_" + label]`. The cosine similarity
	between the two histograms, shifted by +1 and divided by the sum of all
	shifted values, becomes that label's weight.

	Parameters:
		word: the word being scored; also the middle part of the cache file name.
		context: passed unchanged to `expandAndCleanContext`.
		vectors: mapping with an entry for every `word + "_" + label` and for
			every context word that survives filtering.
		clusterCenters: sequence of vectors used as histogram bins.
		expansionParam: passed unchanged to `expandAndCleanContext`.
		rel: mapping; `rel[word].keys()` is intersected with `partVoc`.
		partVoc: set-like object providing `intersection`.
		jointVocCache: mapping that memoises the joint vocabulary per word
			(mutated in place).
		pathToExpansionCache, expansionCacheInfo: prefix and suffix of the
			shelve file name; `word` is placed between them.
		disambiguatedWords: container of words removed from the context.
		labels: iterable of label suffixes to score.

	Returns:
		(wordvectors, probabilities): two lists, both in `labels` order.

	Raises:
		ZeroDivisionError: if `labels` is non-empty and the shifted
			similarities sum to zero.
	"""

	def getHistogram2(words, clusterCenters):
		"""Sum the similarity of every vector in `words` to each cluster
		center and divide by the grand total when that total is positive."""
		histogram = [0 for _ in range(len(clusterCenters))]

		for vector in words:
			sims = [cosine_similarity(vector, center) for center in clusterCenters]
			for i, sim in enumerate(sims):
				histogram[i] += sim

		total = float(sum(histogram))

		# a non-positive total is returned unnormalised
		if total > 0:
			histogram = [value / total for value in histogram]

		return histogram

	# the joint vocabulary is computed once per word and then reused
	if word not in jointVocCache:
		jointVocabulary = partVoc.intersection(set(rel[word].keys()))
		jointVocCache[word] = jointVocabulary
	else:
		jointVocabulary = jointVocCache[word]

	# try/finally guarantees the shelve file is closed even when the
	# expansion step raises
	expansionCache = shelve.open(pathToExpansionCache + word + expansionCacheInfo)
	try:
		expandedContext = expandAndCleanContext(context, word, rel, expansionParam, jointVocabulary, expansionCache)
	finally:
		expansionCache.close()

	# fresh per-call memo of word -> similarities, filled by getHistogram
	indexCache = dict()

	expandedContext = [w for w in expandedContext if w not in disambiguatedWords and w in jointVocabulary]

	# histogram describing the context as a whole
	histogram = getHistogram(expandedContext, clusterCenters, vectors, indexCache)

	# one histogram per candidate sense vector
	wordvectors = [vectors[word + "_" + label] for label in labels]
	histograms = [getHistogram2([vec], clusterCenters) for vec in wordvectors]

	sims = [cosine_similarity(hist, histogram) for hist in histograms]
	# +1 shift; presumably to keep cosine values non-negative before normalising
	simsCorrected = [sim + 1 for sim in sims]

	# the normaliser is computed once instead of once per element
	normaliser = float(sum(simsCorrected))
	probabilities = [x / normaliser for x in simsCorrected]

	return wordvectors, probabilities
예제 #2
0
파일: palm.py 프로젝트: anoukv/coconut
def getHistogram(words, clusterCenters, vectors, indexCache):
	"""Build a similarity histogram of `words` over `clusterCenters`.

	For each word the cosine similarity to every cluster center is added to
	the matching bin. Per-word similarity lists are memoised in `indexCache`
	(mutated in place), so a word already present there is not looked up in
	`vectors` again.

	Returns the list of bin totals, divided by their sum when that sum is
	positive; otherwise the raw totals are returned unchanged.
	"""
	bins = [0] * len(clusterCenters)

	for token in words:
		if token in indexCache:
			# seen before: reuse the stored similarities
			scores = indexCache[token]
		else:
			embedding = vectors[token]
			scores = [cosine_similarity(embedding, center) for center in clusterCenters]
			indexCache[token] = scores

		# add this word's similarities bin by bin
		for position, score in enumerate(scores):
			bins[position] += score

	grandTotal = float(sum(bins))

	if total_is_positive(grandTotal) if False else grandTotal > 0:
		bins = [value / grandTotal for value in bins]

	return bins
예제 #3
0
def getHistogram(words, clusterCenters, vectors, indexCache):
    """Build a similarity histogram of `words` over `clusterCenters`.

    For each word the cosine similarity to every cluster center is added to
    the matching bin. Per-word similarity lists are memoised in `indexCache`
    (mutated in place), so a word already present there is not looked up in
    `vectors` again.

    Returns the list of bin totals, divided by their sum when that sum is
    positive; otherwise the raw totals are returned unchanged.
    """
    bins = [0] * len(clusterCenters)

    for token in words:
        if token in indexCache:
            # seen before: reuse the stored similarities
            scores = indexCache[token]
        else:
            embedding = vectors[token]
            scores = [cosine_similarity(embedding, center)
                      for center in clusterCenters]
            indexCache[token] = scores

        # add this word's similarities bin by bin
        for position, score in enumerate(scores):
            bins[position] += score

    grandTotal = float(sum(bins))

    if grandTotal > 0:
        bins = [value / grandTotal for value in bins]

    return bins
예제 #4
0
def vector_similarity(cs, w1, w2, vectors):
    """Return the similarity of `w1` and `w2`, falling back to `cs`.

    Identical words score 1.0 without any lookup. Otherwise, when both words
    are keys of `vectors`, the cosine similarity of their vectors is
    returned; if either is missing the caller-supplied `cs` is returned.
    """
    # the original compared against the undefined name `word2`
    if w1 == w2:
        return 1.0
    if w1 in vectors and w2 in vectors:
        return cosine_similarity(vectors[w1], vectors[w2])
    return cs
예제 #5
0
def getLabel2(wordVector, expandedContext, vectors):
    """Pick the context word whose vector is most similar to `wordVector`.

    Only candidates that are keys of `vectors` are scored. On ties the
    earliest candidate wins because the comparison is strict.

    Returns:
        (bestWord, bestSim), or (None, None) when no candidate has a vector.
    """
    bestSim = None
    bestWord = None
    for candidate in expandedContext:
        if candidate not in vectors:
            continue
        sim = cosine_similarity(vectors[candidate], wordVector)
        # explicit guard instead of relying on None sorting below every
        # number, which only holds as a CPython 2 implementation detail
        if bestSim is None or sim > bestSim:
            bestSim = sim
            bestWord = candidate
    return bestWord, bestSim
예제 #6
0
def avgSimC(probs1, vecs1, probs2, vecs2):
	"""Probability-weighted average cosine similarity over all vector pairs.

	Every pair (i, j) contributes probs1[i] * probs2[j] * cosine of
	vecs1[i] and vecs2[j]; the sum is divided by the number of pairs.
	A length mismatch between a probability list and its vector list is
	only reported on stdout, the computation still proceeds.
	"""
	countFirst = len(probs1)
	countSecond = len(probs2)
	if countFirst != len(vecs1) or countSecond != len(vecs2):
		print("There is a serious problem!")

	weightedSum = 0
	for i in range(countFirst):
		for j in range(countSecond):
			weightedSum += probs1[i] * probs2[j] * cosine_similarity(vecs1[i], vecs2[j])

	return weightedSum / float(countFirst * countSecond)
예제 #7
0
파일: palm.py 프로젝트: anoukv/coconut
def getLabel2(wordVector, expandedContext, vectors):
	"""Pick the context word whose vector is most similar to `wordVector`.

	Only candidates that are keys of `vectors` are scored. On ties the
	earliest candidate wins because the comparison is strict.

	Returns:
		(bestWord, bestSim), or (None, None) when no candidate has a vector.
	"""
	bestSim = None
	bestWord = None
	for candidate in expandedContext:
		if candidate not in vectors:
			continue
		sim = cosine_similarity(vectors[candidate], wordVector)
		# explicit guard instead of relying on None sorting below every
		# number, which only holds as a CPython 2 implementation detail
		if bestSim is None or sim > bestSim:
			bestSim = sim
			bestWord = candidate
	return bestWord, bestSim
예제 #8
0
def getLabel2(wordVector, expandedContext, vectors):
	"""Return the context word whose vector is most similar to `wordVector`.

	Candidates that are not keys of `vectors` are skipped. On ties the
	earliest candidate wins because the comparison is strict. Returns None
	when no candidate has a vector.
	"""
	bestSim = None
	bestWord = None

	for candidate in expandedContext:
		if candidate in vectors:
			sim = cosine_similarity(vectors[candidate], wordVector)
			# explicit guard instead of relying on None sorting below every
			# number, which only holds as a CPython 2 implementation detail
			if bestSim is None or sim > bestSim:
				bestSim = sim
				bestWord = candidate
	return bestWord
예제 #9
0
def avgSimC(probs1, vecs1, probs2, vecs2):
    """Probability-weighted average cosine similarity over all vector pairs.

    Every pair (i, j) contributes probs1[i] * probs2[j] * cosine of
    vecs1[i] and vecs2[j]; the sum is divided by the number of pairs.
    A length mismatch between a probability list and its vector list is
    only reported on stdout, the computation still proceeds.
    """
    countFirst = len(probs1)
    countSecond = len(probs2)
    if countFirst != len(vecs1) or countSecond != len(vecs2):
        print("There is a serious problem!")

    weightedSum = 0
    for i in range(countFirst):
        for j in range(countSecond):
            pairSim = cosine_similarity(vecs1[i], vecs2[j])
            weightedSum += probs1[i] * probs2[j] * pairSim

    return weightedSum / float(countFirst * countSecond)
예제 #10
0
def context_similarity(c1, c2, clusters):
	"""Compare two contexts through their overlap with each cluster.

	Each context is turned into one score per cluster: the sum of
	cluster[key] * context[key] over the keys they share. A score list whose
	sum is zero is used as is, any other is passed through `normalizeVec`.
	The result is the cosine similarity of the two score lists.
	"""
	def overlap(cluster, context):
		# dot product restricted to the keys present in both mappings
		return sum(cluster[key] * context[key] for key in context if key in cluster)

	def profile(context):
		scores = [overlap(cluster, context) for cluster in clusters]
		if sum(scores) == 0:
			return scores
		return normalizeVec(scores)

	firstProfile = profile(c1)
	secondProfile = profile(c2)
	return cosine_similarity(firstProfile, secondProfile)
예제 #11
0
파일: palm.py 프로젝트: anoukv/coconut
def getLabel3(wordRel, wordVector, expandedContext, vectors, jointVocabulary):
	"""Pick the context word with the best combined relatedness score.

	Only candidates contained in `jointVocabulary` are scored. A candidate's
	score is the mean of `wordRel[candidate]` and the cosine similarity
	between `vectors[candidate]` and `wordVector`. On ties the earliest
	candidate wins because the comparison is strict.

	Returns:
		(bestWord, bestScore), or (None, None) when nothing was scored.
	"""
	bestWord = None
	bestScore = None

	for candidate in expandedContext:
		if candidate not in jointVocabulary:
			continue
		relScore = wordRel[candidate]
		sim = cosine_similarity(vectors[candidate], wordVector)
		score = (relScore + sim) / float(2)
		# explicit guard instead of relying on None sorting below every
		# number, which only holds as a CPython 2 implementation detail
		if bestScore is None or score > bestScore:
			bestScore = score
			bestWord = candidate

	return bestWord, bestScore
예제 #12
0
def getLabel3(wordRel, wordVector, expandedContext, vectors, jointVocabulary):
    """Pick the context word with the best combined relatedness score.

    Only candidates contained in `jointVocabulary` are scored. A candidate's
    score is the mean of `wordRel[candidate]` and the cosine similarity
    between `vectors[candidate]` and `wordVector`. On ties the earliest
    candidate wins because the comparison is strict.

    Returns:
        (bestWord, bestScore), or (None, None) when nothing was scored.
    """
    bestWord = None
    bestScore = None

    for candidate in expandedContext:
        if candidate not in jointVocabulary:
            continue
        relScore = wordRel[candidate]
        sim = cosine_similarity(vectors[candidate], wordVector)
        score = (relScore + sim) / float(2)
        # explicit guard instead of relying on None sorting below every
        # number, which only holds as a CPython 2 implementation detail
        if bestScore is None or score > bestScore:
            bestScore = score
            bestWord = candidate

    return bestWord, bestScore
예제 #13
0
def context_similarity(c1, c2, clusters):
    """Compare two contexts through their overlap with each cluster.

    Each context is turned into one score per cluster: the sum of
    cluster[key] * context[key] over the keys they share. A score list whose
    sum is zero is used as is, any other is passed through `normalizeVec`.
    The result is the cosine similarity of the two score lists.
    """
    def overlap(cluster, context):
        # dot product restricted to the keys present in both mappings
        return sum(cluster[key] * context[key]
                   for key in context if key in cluster)

    def profile(context):
        scores = [overlap(cluster, context) for cluster in clusters]
        if sum(scores) == 0:
            return scores
        return normalizeVec(scores)

    firstProfile = profile(c1)
    secondProfile = profile(c2)
    return cosine_similarity(firstProfile, secondProfile)
예제 #14
0
def dataCompression(data, vectors):
    def getSortedKeys(data):
        counts = [(key, len(data[key])) for key in data]
        counts = sorted(counts, key=lambda x: x[1], reverse=True)
        return map(lambda x: x[0], counts)

    while True:

        bestSim = None
        bestCandidate = None
        bestSubstitute = None

        keys = getSortedKeys(data)

        upper = keys[:len(keys) / 2]
        lower = keys[len(keys) / 2:]

        for candidate in lower:
            if candidate in vectors:
                for substitute in upper:
                    if substitute in vectors:
                        sim = cosine_similarity(vectors[candidate],
                                                vectors[substitute])

                        if sim > bestSim:
                            bestSim = sim
                            bestCandidate = candidate
                            bestSubstitute = substitute

        if bestSim < 0.5:
            break
        elif bestCandidate != None and bestSubstitute != None:
            print "Merging ", bestCandidate, " into ", bestSubstitute, " with sim: ", bestSim
            data[bestSubstitute] += data[bestCandidate]
            del data[bestCandidate]
    print "Keeping labels: (label, context count)"
    keys = getSortedKeys(data)
    for k in keys:
        print k, len(data[k])
    return data
예제 #15
0
파일: palm.py 프로젝트: anoukv/coconut
def dataCompression(data, vectors):

	def getSortedKeys(data):
		counts = [(key, len(data[key])) for key in data]
		counts = sorted(counts, key = lambda x : x[1], reverse = True)
		return map(lambda x: x[0], counts)

	while True:

		bestSim = None
		bestCandidate = None
		bestSubstitute = None

		keys = getSortedKeys(data)

		upper = keys[:len(keys)/2]
		lower = keys[len(keys)/2:]

		for candidate in lower:
			if candidate in vectors:
				for substitute in upper:
					if substitute in vectors:
						sim = cosine_similarity(vectors[candidate], vectors[substitute])

						if sim > bestSim:
							bestSim = sim 
							bestCandidate = candidate
							bestSubstitute = substitute

		if bestSim < 0.5:
			break
		elif bestCandidate != None and bestSubstitute != None:
			print "Merging ", bestCandidate, " into ", bestSubstitute, " with sim: ", bestSim
			data[bestSubstitute]+= data[bestCandidate]
			del data[bestCandidate]
	print "Keeping labels: (label, context count)"
	keys = getSortedKeys(data)
	for k in keys:
		print k, len(data[k])
	return data
예제 #16
0
    def getHistogram2(words, clusterCenters):
        """Build a similarity histogram for a list of vectors.

        `words` holds vectors, not word strings: each one is compared with
        every cluster center and the cosine similarities are summed per
        center. The sums are divided by their grand total when that total
        is positive; otherwise they are returned unchanged.
        """
        bins = [0] * len(clusterCenters)

        for embedding in words:
            scores = [cosine_similarity(embedding, center)
                      for center in clusterCenters]

            # add this vector's similarities bin by bin
            for position, score in enumerate(scores):
                bins[position] += score

        grandTotal = float(sum(bins))

        if grandTotal > 0:
            bins = [value / grandTotal for value in bins]

        return bins
예제 #17
0
	def getHistogram2(words, clusterCenters):
		"""Build a similarity histogram for a list of vectors.

		`words` holds vectors, not word strings: each one is compared with
		every cluster center and the cosine similarities are summed per
		center. The sums are divided by their grand total when that total
		is positive; otherwise they are returned unchanged.
		"""
		bins = [0] * len(clusterCenters)

		for embedding in words:
			scores = [cosine_similarity(embedding, center) for center in clusterCenters]

			# add this vector's similarities bin by bin
			for position, score in enumerate(scores):
				bins[position] += score

		grandTotal = float(sum(bins))

		if grandTotal > 0:
			bins = [value / grandTotal for value in bins]

		return bins
예제 #18
0
            label = decideOnLabel(word2, context2, vectors, clusterCenters,
                                  expansion, rel, partVoc, jointVocCache,
                                  pathToExpansionCache, expansionCacheInfo,
                                  pathToSVMFile, svmFileInfo,
                                  disambiguatedWords)
            new = word2 + "_" + label
            if new in vectors:
                vec2 = vectors[word2 + "_" + label]
                w2 = True
        elif word2 in partVoc:
            vec2 = vectors[word2]
            w2 = True

        # only if both words have been found (somewhere), we continue
        if w1 and w2:
            s = cosine_similarity(vec1, vec2)
            v = vector_similarity(s, word1, word2, normalVectors)
            #print s, v
            score = s * v**3
            methodsRating.append(score)
            humanRating.append(question['rating'])
            if len(methodsRating) % 100 == 0 and len(methodsRating) > 0:
                print i, spearman(methodsRating, humanRating)
            done += 1

    # print the spearman correlation
    print spearman(methodsRating, humanRating)
    print "Coverage: ", done / float(len(questions)) * 100, "%"

    rel.close()
예제 #19
0
rel.close()

toBeShifted = []
for label in labels:
	if not label in vectors:
		toBeShifted.append(label)

labels = list(set(labels) - set(toBeShifted))

while len(labels) > 5:
	best1 = None
	best2 = None
	bestSim = None
	for label1 in labels:
		for label2 in labels:
			if label1 != label2:
				sim = cosine_similarity(vectors[label1], vectors[label2])
				if sim > bestSim:
					bestSim = sim
					best1 = label1
					best2 = label2
	if bestSim < 0.5:
		break
	keeper = getLabel2(vectors['bat'], [best1, best2], vectors)
	if keeper == best1:
		labels.remove(best2)
	else:
		labels.remove(best1)
	print bestSim, " Merging ", best1, " and " , best2, " into ", keeper

print labels
예제 #20
0
def vector_similarity(cs, w1, w2, vectors):
	"""Cosine similarity of the vectors of `w1` and `w2`, or `cs` as fallback.

	The fallback `cs` is returned unchanged whenever either word is not a
	key of `vectors`.
	"""
	if w1 not in vectors or w2 not in vectors:
		return cs
	return cosine_similarity(vectors[w1], vectors[w2])
예제 #21
0
		# if word2 has been disambiguated or is in vectors set finder to true

		if word2 in disambiguatedWords:
			label = decideOnLabel(word2, context2, vectors, clusterCenters, expansion, rel, partVoc, jointVocCache, pathToExpansionCache, expansionCacheInfo, pathToSVMFile, svmFileInfo, disambiguatedWords)
			new = word2 + "_" + label
			if new in vectors:
				vec2 = vectors[word2 + "_" + label]
				w2 = True
		elif word2 in partVoc:
			vec2 = vectors[word2]
			w2 = True


		# only if both words have been found (somewhere), we continue
		if w1 and w2:
			s = cosine_similarity(vec1, vec2)
			v = vector_similarity(s, word1, word2, normalVectors)
			#print s, v
			score = s * v**3
			methodsRating.append(score) 
			humanRating.append(question['rating'])
			if len(methodsRating) % 100 == 0 and len(methodsRating) > 0:
				print i, spearman(methodsRating, humanRating)
			done += 1

	# print the spearman correlation
 	print spearman(methodsRating, humanRating)
 	print "Coverage: ", done / float(len(questions)) * 100, "%"

 	rel.close()
예제 #22
0
		
		# if word2 has been disambiguated or is in vectors set finder to true

		if word2 in disambiguatedWords:
			label = decideOnLabel(word2, context2, vectors, clusterCenters, expansion, rel, partVoc, jointVocCache, pathToExpansionCache, expansionCacheInfo, pathToSVMFile, svmFileInfo, disambiguatedWords)
			new = word2 + "_" + label
			if new in vectors:
				vec21 = vectors[word2 + "_" + label]
				vec22 = normalVectors[word2]
				vec2 = getAverageWordRep2([vec21, vec22])
				w2 = True
		elif word2 in partVoc:
			vec2 = vectors[word2]
			w2 = True


		# only if both words have been found (somewhere), we continue
		if w1 and w2:
			methodsRating.append(cosine_similarity(vec1, vec2)) 
			humanRating.append(question['rating'])
			if len(methodsRating) % 100 == 0 and len(methodsRating) > 0:
				print i, spearman(methodsRating, humanRating)
			done += 1

	# print the spearman correlation
 	print spearman(methodsRating, humanRating)
 	print "Coverage: ", done / float(len(questions)) * 100, "%"

 	rel.close()

예제 #23
0
			label2 = decideOnLabel(word2, context2, vectors, clusterCenters, expansion, rel, partVoc, jointVocCache, pathToExpansionCache, expansionCacheInfo, pathToSVMFile, svmFileInfo, disambiguatedWords)
			new = word2 + "_" + label2
			if new in vectors:
				vec2 = vectors[word2 + "_" + label2]
				w2 = True
		elif word2 in partVoc:
			label2 = "not ambiguous"
			vec2 = vectors[word2]
			w2 = True


		# only if both words have been found (somewhere), we continue
		if w1 and w2:
			
			
			score = cosine_similarity(vec1, vec2)
			base = cosine_similarity(normalVectors[word1], normalVectors[word2])

			# if abs(base - score) > 0.3:
				# print question['word1'], label1
				# print question['context1']
				# print question['word2'], label2
				# print question['context2']
				# print i, "\tHuman average: ", question['rating']
				# print i, "\tPALM score", score 
				# print i, "\tBaseline: ", base
				# print i, "\tDifferent between base and palm: ", abs(base - score)

			methodsRating.append(score) 
			humanRating.append(question['rating'])
			otherRating.append(base)
예제 #24
0
				vec2 = vectors[word2 + "_" + label2]
				w2 = True
		elif word2 in partVoc:
			label2 = "not ambiguous"
			vec2 = vectors[word2]
			w2 = True


		# only if both words have been found (somewhere), we continue
		if w1 and w2:
			
			print question['word1'], label1
			print question['context1']
			print question['word2'], label2
			print question['context2']
			score = cosine_similarity(vec1, vec2)
			base = cosine_similarity(normalVectors[word1], normalVectors[word2])
			print "\tPALM score", score 
			print "\tHuman average: ", question['rating']
			print "\tBaseline: ", base
			print "\tDifferent between base and palm: ", abs(base - score)
			print
			print
			methodsRating.append(score) 
			humanRating.append(question['rating'])
			if len(methodsRating) % 100 == 0 and len(methodsRating) > 0:
				print i, spearman(methodsRating, humanRating)
			done += 1

	# print the spearman correlation
 	print spearman(methodsRating, humanRating)
예제 #25
0
def decideOnLabel(word, context, vectors, clusterCenters, expansionParam, rel,
                  partVoc, jointVocCache, pathToExpansionCache,
                  expansionCacheInfo, disambiguatedWords, labels):
    """Return the candidate sense vectors of `word` with a weight for each.

    A similarity histogram over `clusterCenters` is built for the expanded
    context and for each `vectors[word + "_" + label]`. The cosine similarity
    between the two histograms, shifted by +1 and divided by the sum of all
    shifted values, becomes that label's weight.

    Parameters:
        word: the word being scored; also the middle part of the cache file
            name.
        context: passed unchanged to `expandAndCleanContext`.
        vectors: mapping with an entry for every `word + "_" + label` and for
            every context word that survives filtering.
        clusterCenters: sequence of vectors used as histogram bins.
        expansionParam: passed unchanged to `expandAndCleanContext`.
        rel: mapping; `rel[word].keys()` is intersected with `partVoc`.
        partVoc: set-like object providing `intersection`.
        jointVocCache: mapping that memoises the joint vocabulary per word
            (mutated in place).
        pathToExpansionCache, expansionCacheInfo: prefix and suffix of the
            shelve file name; `word` is placed between them.
        disambiguatedWords: container of words removed from the context.
        labels: iterable of label suffixes to score.

    Returns:
        (wordvectors, probabilities): two lists, both in `labels` order.

    Raises:
        ZeroDivisionError: if `labels` is non-empty and the shifted
            similarities sum to zero.
    """
    def getHistogram2(words, clusterCenters):
        """Sum the similarity of every vector in `words` to each cluster
        center and divide by the grand total when that total is positive."""
        histogram = [0 for _ in range(len(clusterCenters))]

        for vector in words:
            sims = [cosine_similarity(vector, center)
                    for center in clusterCenters]
            for i, sim in enumerate(sims):
                histogram[i] += sim

        total = float(sum(histogram))

        # a non-positive total is returned unnormalised
        if total > 0:
            histogram = [value / total for value in histogram]

        return histogram

    # the joint vocabulary is computed once per word and then reused
    if word not in jointVocCache:
        jointVocabulary = partVoc.intersection(set(rel[word].keys()))
        jointVocCache[word] = jointVocabulary
    else:
        jointVocabulary = jointVocCache[word]

    # try/finally guarantees the shelve file is closed even when the
    # expansion step raises
    expansionCache = shelve.open(pathToExpansionCache + word +
                                 expansionCacheInfo)
    try:
        expandedContext = expandAndCleanContext(context, word, rel,
                                                expansionParam,
                                                jointVocabulary,
                                                expansionCache)
    finally:
        expansionCache.close()

    # fresh per-call memo of word -> similarities, filled by getHistogram
    indexCache = dict()

    expandedContext = [
        w for w in expandedContext
        if w not in disambiguatedWords and w in jointVocabulary
    ]

    # histogram describing the context as a whole
    histogram = getHistogram(expandedContext, clusterCenters, vectors,
                             indexCache)

    # one histogram per candidate sense vector
    wordvectors = [vectors[word + "_" + label] for label in labels]
    histograms = [getHistogram2([vec], clusterCenters) for vec in wordvectors]

    sims = [cosine_similarity(hist, histogram) for hist in histograms]
    # +1 shift; presumably to keep cosine values non-negative before
    # normalising
    simsCorrected = [sim + 1 for sim in sims]

    # the normaliser is computed once instead of once per element
    normaliser = float(sum(simsCorrected))
    probabilities = [x / normaliser for x in simsCorrected]

    return wordvectors, probabilities