Example #1
import sys
from copy import deepcopy

# I/O helpers; read_word_vecs comes from the project-local io module (not the
# stdlib one), and read_ppdb / print_word_vecs are assumed to live beside it.
from io import read_word_vecs, read_ppdb, print_word_vecs

def euclidean(wordVecs, ppDict, numIters):
    ''' Retrofit word vectors to the ppdb graph: each vector is pulled toward
    the average of its original (data) estimate and the current vectors of
    its paraphrase neighbours. '''
    newWordVecs = deepcopy(wordVecs)
    wvVocab = set(newWordVecs.keys())
    loopVocab = wvVocab.intersection(set(ppDict.keys()))
    for it in range(numIters):
        # loop through every node also in the ontology (otherwise just keep the data estimate)
        for word in loopVocab:
            wordNeighbours = set(ppDict[word]).intersection(wvVocab)
            numNeighbours = len(wordNeighbours)
            # no neighbours: skip and keep the data estimate
            if numNeighbours == 0:
                continue
            # the data estimate gets weight numNeighbours so that it balances
            # the total weight of the neighbour sum below (a 50/50 blend)
            newVec = numNeighbours * wordVecs[word]
            # add each neighbour's current vector (weight 1 each)
            for ppWord in wordNeighbours:
                newVec += newWordVecs[ppWord]
            newWordVecs[word] = newVec / (2 * numNeighbours)
    return newWordVecs
  
if __name__ == '__main__':
    wordVecs = read_word_vecs(sys.argv[1])
    ppDict = read_ppdb(sys.argv[2], wordVecs)
    numIter = int(sys.argv[3])
    outFileName = sys.argv[4]

    # Enrich the word vectors using ppdb and print the enriched vectors
    print_word_vecs(euclidean(wordVecs, ppDict, numIter), outFileName)
	
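A minimal usage sketch (the two-word vocabulary and the paraphrase pairs below are invented for illustration). Because the data estimate and the neighbour sum carry equal total weight, each vector converges to a point between its original value and its neighbour's:

import numpy as np

toyVecs = {'happy': np.array([1.0, 0.0]), 'glad': np.array([0.0, 1.0])}
toyPpdb = {'happy': ['glad'], 'glad': ['happy']}

retrofitted = euclidean(toyVecs, toyPpdb, 10)
print(retrofitted['happy'])  # approaches [2/3, 1/3]
print(retrofitted['glad'])   # approaches [1/3, 2/3]

From the shell, assuming the snippet were saved as retrofit.py (a name of our choosing, with placeholder file arguments): python retrofit.py vectors.txt ppdb.txt 10 out.txt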
Example #2
import sys

from io import read_word_vecs  # project-local io module, not the stdlib one
from ranking import spearmans_rho
from ranking import assign_ranks
from numpy.linalg import norm
from random import shuffle
from operator import itemgetter
def cosine_sim(vec1, vec2):
    ''' Calculates the cosine similarity between two numpy arrays '''
    return vec1.dot(vec2) / (norm(vec1) * norm(vec2))


if __name__ == '__main__':
    wordVectorFile = sys.argv[1]
    DIR = sys.argv[2]
    wordVectors = read_word_vecs(wordVectorFile)
    print('=================================================================================')
    print("%6s" % "Serial", "%20s" % "Dataset", "%15s" % "Num Quests", "%15s" % "Not found", "%15s" % "%")
    print('=================================================================================')
    FILES = ['EN-ESL-50.txt', 'EN-RD-300.txt', 'EN-TOEFL-80.txt']

    for i, FILE in enumerate(FILES):
        targets = []
        mostSim = []
        candidates = []
        for l in open(DIR + FILE, 'r'):
            w = [c.strip() for c in l.strip().split('|')]
            targets.append(w[0])
            mostSim.append(w[1])
            cands = w[1:]
            shuffle(cands)  # shuffle a copy in place; shuffling the slice w[1:] directly would only reorder a throwaway list
            candidates.append(cands)
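The example is cut off before any scoring happens. A hypothetical continuation (the names below are ours, not from the snippet), indented to sit one level inside the for i, FILE loop, would pick the candidate whose vector lies closest to the target under cosine_sim and tally the accuracy:

        correct, attempted = 0, 0
        for target, gold, cands in zip(targets, mostSim, candidates):
            # skip questions containing out-of-vocabulary words
            if target not in wordVectors or any(c not in wordVectors for c in cands):
                continue
            guess = max(cands, key=lambda c: cosine_sim(wordVectors[target], wordVectors[c]))
            attempted += 1
            correct += (guess == gold)
        notFound = len(targets) - attempted
        print("%6d" % i, "%20s" % FILE, "%15d" % len(targets), "%15d" % notFound,
              "%15.2f" % (100.0 * correct / max(attempted, 1)))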
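A quick sanity check of cosine_sim on hand-made vectors (the arrays are invented for illustration):

import numpy as np

a = np.array([1.0, 2.0, 3.0])
b = np.array([2.0, 4.0, 6.0])   # parallel to a
c = np.array([-2.0, 1.0, 0.0])  # orthogonal to a

print(cosine_sim(a, b))  # 1.0, up to floating-point rounding
print(cosine_sim(a, c))  # 0.0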