def test_analogy(self):
    from wordvector import WordVector
    dictionary = {'the': 0, 'quick': 1, 'brown': 2,
                  'fox': 3, 'jumped': 4, 'over': 5}
    embed_matrix = np.array([[1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
                             [1.0, 0.5, 0.1, 0.1, 0.1, 0.1, 0.1],
                             [-1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
                             [1.0, 0.1, 0.1, 1.1, 1.1, 1.1, 0.1],
                             [1.0, 0.6, 0.1, 1.1, 1.1, 1.1, 0.1],
                             [1.0, 0.7, 0.1, 1.1, 1.1, 1.1, 0.1]])
    word_embedding = WordVector(embed_matrix, dictionary)
    d = word_embedding.analogy('the', 'fox', 'quick', num=2, metric='euclidean')
    self.assertEqual(2, len(d), 'wrong number of analogies returned')
    self.assertEqual('jumped', d[0], 'wrong most likely analogy returned')
    self.assertEqual('over', d[1], 'wrong 2nd most likely analogy returned')
def test_get_vector_by_num(self):
    from wordvector import WordVector
    dictionary = {'the': 0, 'quick': 1, 'brown': 2,
                  'fox': 3, 'jumped': 4, 'over': 5}
    embed_matrix = np.array([[1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
                             [1.0, 0.05, 0.1, 0.1, 0.1, 0.1, 0.1],
                             [-1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
                             [1.0, 0.1, 0.1, 1.1, 1.1, 1.1, 0.1],
                             [1.0, 0.1, 0.1, 0.9, 0.9, 0.9, 0.1],
                             [1.0, 0.1, 0.1, 1.0, 1.0, 0.8, 0.1]])
    word_embedding = WordVector(embed_matrix, dictionary)
    self.assertTrue(
        np.sum(np.abs(np.array([1.0, 0.1, 0.1, 1.1, 1.1, 1.1, 0.1]) -
                      word_embedding.get_vector_by_num(3))) < 0.1,
        'incorrect vector returned for index 3')
    self.assertTrue(
        np.sum(np.abs(np.array([1.0, 0.1, 0.1, 1.0, 1.0, 0.8, 0.1]) -
                      word_embedding.get_vector_by_num(5))) < 0.1,
        'incorrect vector returned for index 5')
    self.assertTrue(
        np.sum(np.abs(np.array([1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]) -
                      word_embedding.get_vector_by_num(0))) < 0.1,
        'incorrect vector returned for index 0')
def test_closest_row_indices(self):
    from wordvector import WordVector
    dictionary = {'the': 0, 'quick': 1, 'brown': 2,
                  'fox': 3, 'jumped': 4, 'over': 5}
    embed_matrix = np.array([[1.0, 1.01],
                             [2.0, 2.0],
                             [2.0, 2.1],
                             [1.0, 0.0],
                             [0, 1.01],
                             [-1.0, 0.0]])
    word_embedding = WordVector(embed_matrix, dictionary)
    dist_list = word_embedding.closest_row_indices(np.array([[2.0, 2.0]]), 3, 'euclidean')
    self.assertTrue(np.sum(np.abs(np.array([1, 2, 0]) - dist_list)) < 0.1,
                    'incorrect closest indices')
    dist_list = word_embedding.closest_row_indices(np.array([[2.0, 2.0]]), 3, 'cosine')
    self.assertTrue(np.sum(np.abs(np.array([1, 0, 2]) - dist_list)) < 0.1,
                    'incorrect closest indices')
    dist_list = word_embedding.closest_row_indices(np.array([[1.0, 1.0]]), 6, 'euclidean')
    self.assertTrue(np.sum(np.abs(np.array([0, 3, 4, 1, 2, 5]) - dist_list)) < 0.1,
                    'incorrect closest indices')
def test_gets(self):
    from wordvector import WordVector
    dictionary = {'the': 0, 'quick': 1, 'brown': 2,
                  'fox': 3, 'jumped': 4, 'over': 5}
    embed_matrix = np.array([[1.0, 1.01],
                             [2.0, 2.0],
                             [2.0, 2.1],
                             [1.0, 0.0],
                             [0, 1.01],
                             [-1.0, 0.0]])
    word_embedding = WordVector(embed_matrix, dictionary)
    d = word_embedding.get_dict()
    dr = word_embedding.get_reverse_dict()
    em = word_embedding.get_embed()
    d.pop('the')   # mutate, check that copies were returned
    dr.pop(1)
    em[0, 0] = 10
    d = word_embedding.get_dict()
    dr = word_embedding.get_reverse_dict()
    em = word_embedding.get_embed()
    self.assertEqual(6, len(d), 'wrong dictionary length')
    self.assertEqual(6, len(dr), 'wrong dictionary length')
    self.assertEqual(1.0, em[0, 0], 'wrong value in embed matrix')
    self.assertEqual(3, d['fox'], 'wrong value from dictionary')
    self.assertEqual('jumped', dr[4], 'wrong value from reverse dictionary')
def test_num_words(self):
    from wordvector import WordVector
    dictionary = {'the': 0, 'quick': 1, 'brown': 2,
                  'fox': 3, 'jumped': 4, 'over': 5}
    embed_matrix = np.array([[1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
                             [1.0, 0.05, 0.1, 0.1, 0.1, 0.1, 0.1],
                             [-1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
                             [1.0, 0.1, 0.1, 1.1, 1.1, 1.1, 0.1],
                             [1.0, 0.1, 0.1, 0.9, 0.9, 0.9, 0.1],
                             [1.0, 0.1, 0.1, 1.0, 1.0, 0.8, 0.1]])
    word_embedding = WordVector(embed_matrix, dictionary)
    self.assertEqual(6, word_embedding.num_words(), 'incorrect number of words')
def test_n_closest(self):
    from wordvector import WordVector
    dictionary = {'the': 0, 'quick': 1, 'brown': 2,
                  'fox': 3, 'jumped': 4, 'over': 5}
    embed_matrix = np.array([[1.0, 1.01],
                             [2.0, 2.0],
                             [2.0, 2.1],
                             [1.0, 0.0],
                             [0, 1.01],
                             [-1.0, 0.0]])
    word_embedding = WordVector(embed_matrix, dictionary)
    nc_list = word_embedding.n_closest('quick', 3, metric='euclidean')
    self.assertEqual(['quick', 'brown', 'the'], nc_list, 'wrong n-closest words returned')
    nc_list = word_embedding.n_closest('quick', 2, metric='cosine')
    self.assertEqual(['the', 'fox'], nc_list, 'wrong n-closest words returned')
def test_most_common(self):
    from wordvector import WordVector
    dictionary = {'the': 0, 'quick': 1, 'brown': 2,
                  'fox': 3, 'jumped': 4, 'over': 5}
    embed_matrix = np.array([[1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
                             [1.0, 0.5, 0.1, 0.1, 0.1, 0.1, 0.1],
                             [-1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
                             [1.0, 0.1, 0.1, 1.1, 1.1, 1.1, 0.1],
                             [1.0, 0.6, 0.1, 1.1, 1.1, 1.1, 0.1],
                             [1.0, 0.7, 0.1, 1.1, 1.1, 1.1, 0.1]])
    word_embedding = WordVector(embed_matrix, dictionary)
    mc_list = word_embedding.most_common(3)
    self.assertEqual(['the', 'quick', 'brown'], mc_list, 'wrong most common words returned')
    mc_list = word_embedding.most_common(1)
    self.assertEqual(['the'], mc_list, 'wrong most common words returned')
def test_project_2D_2(self):
    from wordvector import WordVector
    dictionary = {'the': 0, 'quick': 1, 'brown': 2,
                  'fox': 3, 'jumped': 4, 'over': 5}
    embed_matrix = np.array([[1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
                             [1.0, 0.05, 0.1, 0.1, 0.1, 0.1, 0.1],
                             [-1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
                             [1.0, 0.1, 0.1, 1.1, 1.1, 1.1, 0.1],
                             [1.0, 0.1, 0.1, 0.9, 0.9, 0.9, 0.1],
                             [1.0, 0.1, 0.1, 1.0, 1.0, 0.8, 0.1]])
    word_embedding = WordVector(embed_matrix, dictionary)
    proj, words = word_embedding.project_2d(0, 6)
    self.assertEqual((6, 2), proj.shape, 'incorrect projection array size returned')
    self.assertEqual('the', words[0], 'incorrect word at index 0')
    self.assertEqual('fox', words[3], 'incorrect word at index 3')
def load():
    files = ['../data/adventures_of_sherlock_holmes.txt',
             '../data/hound_of_the_baskervilles.txt',
             '../data/sign_of_the_four.txt']
    word_array, dictionary, num_lines, num_words = docload.build_word_array(
        files, vocab_size=50000, gutenberg=True)
    print('Document loaded and processed: {} lines, {} words.'
          .format(num_lines, num_words))

    print('Building training set ...')
    x, y = WindowModel.build_training_set(word_array)

    # shuffle and split 10% validation data
    x_shuf, y_shuf = sklearn.utils.shuffle(x, y, random_state=0)
    split = round(x_shuf.shape[0] * 0.9)
    x_val, y_val = (x_shuf[split:, :], y_shuf[split:, :])
    x_train, y_train = (x_shuf[:split, :], y_shuf[:split, :])
    print('Training set built.')

    graph_params = {'batch_size': 32,
                    'vocab_size': np.max(x) + 1,
                    'embed_size': 64,
                    'hid_size': 64,
                    'neg_samples': 64,
                    'learn_rate': 0.01,
                    'momentum': 0.9,
                    'embed_noise': 0.1,
                    'hid_noise': 0.3,
                    'optimizer': 'Momentum'}
    model = WindowModel(graph_params)
    print('Model built. Vocab size = {}. Document length = {} words.'
          .format(np.max(x) + 1, len(word_array)))

    print('Training ...')
    results = model.train(x_train, y_train, x_val, y_val, epochs=120, verbose=False)

    word_vector_embed = WordVector(results['embed_weights'], dictionary)
    word_vector_nce = WordVector(results['nce_weights'], dictionary)
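
# A minimal sketch (hypothetical, not part of the original module): assuming
# load() is extended to return the two WordVector objects it builds, this
# helper prints a few quick diagnostics using only methods exercised by the
# tests above (num_words, most_common, get_dict, n_closest). The probe word
# and metric are illustrative.
def inspect_embedding(word_vector, probe_word='holmes'):
    print('{} words in vocabulary'.format(word_vector.num_words()))
    print('10 most common words: {}'.format(word_vector.most_common(10)))
    if probe_word in word_vector.get_dict():
        print("5 closest to '{}': {}".format(
            probe_word, word_vector.n_closest(probe_word, 5, metric='cosine')))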
def test_words_in_range(self):
    from wordvector import WordVector
    dictionary = {'the': 0, 'quick': 1, 'brown': 2,
                  'fox': 3, 'jumped': 4, 'over': 5}
    embed_matrix = np.array([[1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
                             [1.0, 0.5, 0.1, 0.1, 0.1, 0.1, 0.1],
                             [-1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
                             [1.0, 0.1, 0.1, 1.1, 1.1, 1.1, 0.1],
                             [1.0, 0.6, 0.1, 1.1, 1.1, 1.1, 0.1],
                             [1.0, 0.7, 0.1, 1.1, 1.1, 1.1, 0.1]])
    word_embedding = WordVector(embed_matrix, dictionary)
    range_list = word_embedding.words_in_range(3, 6)
    self.assertEqual(['fox', 'jumped', 'over'], range_list, 'wrong words in range returned')
    range_list = word_embedding.words_in_range(0, 2)
    self.assertEqual(['the', 'quick'], range_list, 'wrong words in range returned')
class Thesaurus:

    wordposPATT = re.compile('(.*)/(.)')  # only first char of POS
    byblo = False  # byblo neighbours file or appthes generated from vector file

    def __init__(self, vectorfilename, simcachefile, simcache, windows, k, adja, adjb, compress):
        self.vectorfilename = vectorfilename
        self.simcachefile = simcachefile
        self.simcache = simcache
        self.thisvector = ""
        self.vectordict = {}   # dictionary of vectors
        self.allfeatures = {}  # dictionary of all feature dimensions
        self.updated = 0
        self.fkeys = []        # list (to be sorted) of all features
        self.fk_idx = {}       # feature --> dimension
        self.dim = 0
        WordVector.windows = windows
        self.k = k
        self.adja = adja
        self.adjb = adjb
        self.filter = False
        self.filterwords = []
        self.compress = compress  # whether to generate sparse vector representation for efficient sim calcs

    def readvectors(self):
        if self.simcache:  # don't bother reading in vectors - just need simcache
            same = True
        else:
            print "Reading vector file " + self.vectorfilename
            linesread = 0
            instream = open(self.vectorfilename, 'r')
            for line in instream:
                self.processvectorline(line.rstrip())
                linesread += 1
                if linesread % 10000 == 0:
                    print "Read " + str(linesread) + " lines and updated " + str(self.updated) + " vectors"
                    sys.stdout.flush()
            print "Read " + str(linesread) + " lines and updated " + str(self.updated) + " vectors"
            instream.close()
            if self.compress:
                print "Compressing vector dictionary representation"
                self.makematrix()
                print "Finished sparse array generation"

    def processvectorline(self, line):
        featurelist = line.split('\t')
        matchobj = Thesaurus.wordposPATT.match(featurelist[0])
        if matchobj:
            wordpos = (matchobj.group(1), matchobj.group(2))
        else:
            print "Error with vector file matching " + featurelist[0]
            # this could be "__FILTERED" so ignore line and carry on
            return
        #if len(featurelist) > WordVector.dim:
        #    WordVector.dim = len(featurelist)
        self.vectordict[wordpos] = WordVector(wordpos)  # initialise WordVector in vector dictionary
        featurelist.reverse()  # reverse list so can pop features and scores off
        featurelist.pop()      # take off last item which is word itself
        self.updatevector(wordpos, featurelist)
        self.updated += 1

    def updatevector(self, wordpos, featurelist):
        while len(featurelist) > 0:
            f = featurelist.pop()
            sc = featurelist.pop()
            added = self.vectordict[wordpos].addfeature(f, sc)
            if added:
                self.allfeatures[f] = 1
        self.vectordict[wordpos].length = pow(self.vectordict[wordpos].length2, 0.5)

    def readsims(self):
        print "Reading sim file " + self.simcachefile
        linesread = 0
        instream = open(self.simcachefile, 'r')
        for line in instream:
            self.processsimline(line.rstrip())
            linesread += 1
            if linesread % 1000 == 0:
                print "Read " + str(linesread) + " lines and updated " + str(self.updated) + " similarity vectors"
                sys.stdout.flush()
        #return self.topk(self.k)
        print "Read " + str(linesread) + " lines and updated " + str(self.updated) + " vectors"
        instream.close()

    def processsimline(self, line):
        featurelist = line.split('\t')
        matchobj = Thesaurus.wordposPATT.match(featurelist[0])
        if matchobj:
            wordpos = (matchobj.group(1), matchobj.group(2))
        else:
            print "Error with vector file matching " + featurelist[0]
            return
        #self.vectordict[wordpos] = WordVector(wordpos)  # initialise WordVector in vector dictionary
        (word, pos) = wordpos
        add = True
        if self.filter:
            if word + "/" + pos in self.filterwords:
                add = True
            else:
                add = False
        if add:
            self.thisvector = WordVector(wordpos)
            featurelist.reverse()  # reverse list so can pop features and scores off
            featurelist.pop()      # take off last item which is word itself
            if Thesaurus.byblo:    # no extra fields
                check = True
            else:
                self.thisvector.width = float(featurelist.pop())
                self.thisvector.length = float(featurelist.pop())
            self.updatesimvector(wordpos, featurelist)
            self.thisvector.topk(self.k)
            self.vectordict[wordpos] = self.thisvector
            #self.vectordict[wordpos].displaysims()
            self.updated += 1

    def updatesimvector(self, wordpos, featurelist):
        while len(featurelist) > 0:
            f = featurelist.pop()
            sc = featurelist.pop()
            self.thisvector.allsims[f] = float(sc)

    def makematrix(self):
        self.fkeys = self.allfeatures.keys()
        self.fkeys.sort()
        for i in range(len(self.fkeys)):
            self.fk_idx[self.fkeys[i]] = i
        del self.fkeys
        del self.allfeatures
        self.dim = len(self.fk_idx)
        print "Dimensionality is " + str(self.dim)
        update_params(self.dim, self.adja, self.adjb)
        self.makearrays()

    def makearrays(self):
        # need to convert a word vector which stores a dictionary of features
        # into a sparse array based on fk_idx
        for wordvector in self.vectordict.values():
            temparray = numpy.zeros(self.dim)
            for feature in wordvector.vector.keys():
                col = self.fk_idx[feature]
                score = wordvector.vector[feature]
                temparray[col] = score
            #print temparray
            wordvector.array = sparse.csr_matrix(temparray)
            #print wordvector.array.data
            #print "Converted " + wordvector.word + "/" + wordvector.pos

    def allpairssims(self, metric):
        if self.simcache:  # read in from sim cache
            self.readsims()
            #outstream = open(self.simcachefile, 'w')
            #for wordvectorA in self.vectordict.values():
            #    wordvectorA.outputsims(outstream)
            #outstream.close()
        else:
            # compute all pairs sims and write sim cache
            outstream = open(self.simcachefile, 'w')
            done = 0
            for wordvectorA in self.vectordict.values():
                wordvectorA.allsims = {}
                for wordvectorB in self.vectordict.values():
                    if wordvectorA.equals(wordvectorB):  # ignore
                        same = True
                    else:
                        label = wordvectorB.word + "/" + wordvectorB.pos
                        sim = wordvectorA.findsim(wordvectorB, metric)
                        if sim < 0:
                            wordvectorA.debug = True
                            wordvectorA.findsim(wordvectorB, metric)
                        if sim > 1:
                            wordvectorA.debug = True
                            wordvectorA.findsim(wordvectorB, metric)
                        wordvectorA.allsims[label] = sim
                wordvectorA.outputtopk(outstream, self.k)
                done += 1
                if done % 100 == 0:
                    print "Completed similarity calculations for " + str(done) + " words"
        #for wordvectorA in self.vectordict.values():
        #    wordvectorA.analyse()

    def outputsim(self, wordA, wordB, metric):
        sim = -1
        if self.simcache:
            (wa, pa) = wordA
            if wordA in self.vectordict.keys():
                (wb, pb) = wordB
                label = wb + "/" + pb
                if label in self.vectordict[wordA].allsims.keys():
                    sim = self.vectordict[wordA].allsims[label]
                    print "Similarity between " + wa + "/" + pa + " and " + wb + "/" + pb + " is " + str(sim)
                else:
                    print label + " not in neighbour set"
            else:
                print wa + "/" + pa + " not in dictionary"
        else:
            if wordA in self.vectordict.keys():
                vectorA = self.vectordict[wordA]
                if wordB in self.vectordict.keys():
                    vectorB = self.vectordict[wordB]
                    sim = vectorA.findsim(vectorB, metric)
                    print "Similarity between " + vectorA.word + "/" + vectorA.pos + " and " + vectorB.word + "/" + vectorB.pos + " is " + str(sim)
                    print "(" + str(vectorA.width) + ", " + str(vectorB.width) + ")"
                else:
                    (word, pos) = wordB
                    print word + "/" + pos + " not in dictionary"
            else:
                (word, pos) = wordA
                print word + "/" + pos + " not in dictionary"

    def topk(self, k):
        # retain top k neighbours for each word
        for thisvector in self.vectordict.values():
            thisvector.topk(k)

    def topsim(self, sim):
        # retain similarities over sim threshold
        for thisvector in self.vectordict.values():
            #print thisvector, sim
            thisvector.keeptopsim(sim)

    def displayneighs(self, word, k):
        if word in self.vectordict.keys():
            vector = self.vectordict[word]
            vector.topk(k)
            vector.displaysims()
        else:
            (word, pos) = word
            print word + "/" + pos + " not in dictionary"

    def analyse(self):
        totaltop = 0.0
        totalavg = 0.0
        squaretop = 0.0
        squareavg = 0.0
        count = 0
        correlationx = []
        correlationy1 = []
        correlationy2 = []
        totalsd = 0.0
        squaresd = 0.0
        for wordvectorA in self.vectordict.values():
            count += 1
            totaltop += wordvectorA.topsim
            squaretop += wordvectorA.topsim * wordvectorA.topsim
            totalavg += wordvectorA.avgsim
            squareavg += wordvectorA.avgsim * wordvectorA.avgsim
            totalsd += wordvectorA.sd
            squaresd += wordvectorA.sd * wordvectorA.sd
            correlationx.append(float(wordvectorA.width))
            correlationy1.append(float(wordvectorA.topsim))
            correlationy2.append(float(wordvectorA.avgsim))
        avgtop = totaltop / count
        sdtop = pow(squaretop / count - avgtop * avgtop, 0.5)
        avgavg = totalavg / count
        sdavg = pow(squareavg / count - avgavg * avgavg, 0.5)
        avgsd = totalsd / count
        sdsd = pow(squaresd / count - avgsd * avgsd, 0.5)
        print "Top similarity: average = " + str(avgtop) + " sd = " + str(sdtop)
        print "average similarity: average = " + str(avgavg) + " sd = " + str(sdavg)
        print "SD similarity: average = " + str(avgsd) + " sd = " + str(sdsd)
        #print correlationx
        #print correlationy1
        x = numpy.array(correlationx)
        y = numpy.array(correlationy1)
        #print x
        #print y
        thispoly = numpy.poly1d(numpy.polyfit(x, y, 1))
        pr = stats.spearmanr(x, y)
        mytitle = "Regression line for width and top similarity"
        #self.showpoly(x, y, thispoly, mytitle, pr, 1, 1)
        print "SRCC for width and top similarity is " + str(pr[0]) + " (" + str(pr[1]) + ")"
        print thispoly
        x = numpy.array(correlationx)
        y = numpy.array(correlationy2)
        thispoly = numpy.poly1d(numpy.polyfit(x, y, 1))
        pr = stats.spearmanr(x, y)
        mytitle = "Regression line for width and average similarity"
        #self.showpoly(x, y, thispoly, mytitle, pr, 1, 1)
        print "SRCC for width and average similarity is " + str(pr[0]) + " (" + str(pr[1]) + ")"
        print thispoly

    def showpoly(self, x, y, poly, title, pr, xl, yl):
        xp = numpy.linspace(0, xl, 100)
        plt.plot(x, y, '.', xp, poly(xp), '-')
        plt.ylim(0, yl)
        plt.title(title)
        mytext1 = "srcc = " + str(pr[0])
        mytext2 = "p = " + str(pr[1])
        plt.text(0.05, yl * 0.9, mytext1)
        plt.text(0.05, yl * 0.8, mytext2)
        plt.show()
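
# A minimal usage sketch (hypothetical, not part of the original module): the
# file names, parameter values and metric below are illustrative only. It shows
# one plausible way to drive the Thesaurus class above: read a raw vector file,
# compute all-pairs similarities (writing the sim cache), then display the top
# neighbours of a (word, pos) pair.
def build_thesaurus_example():
    thesaurus = Thesaurus("vectors.txt",  # vector file name (illustrative)
                          "sims.cache",   # sim cache file to write (illustrative)
                          False,          # simcache: compute sims rather than read an existing cache
                          5,              # windows (illustrative)
                          10,             # k: neighbours to keep per word
                          1.0, 0.0,       # adja, adjb (illustrative)
                          True)           # compress: build sparse arrays for sim calculations
    thesaurus.readvectors()
    thesaurus.allpairssims('cosine')      # metric name is illustrative
    thesaurus.displayneighs(("dog", "N"), 10)
    return thesaurus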