Example no. 1
	def rate_of_change_bigrams(self,words=None):
		if not self.aligned:
			print('>> Rate of Change requires that the word2vec models have been aligned. Run align() first.')
			return

		if not words: words=self.mfw()

		def writegen():
			# m1_m2s is not defined anywhere in this snippet; consecutive,
			# time-ordered model pairs are assumed here.
			m1_m2s=list(zip(self.models,self.models[1:]))
			for word in words:
				old=[]
				for m1,m2 in m1_m2s:
					if not word in m1.vocabset or not word in m2.vocabset: continue
					sim=m1.similarity(word,m2)
					dist=1-sim
					odx={'word':word, 'model1':m1.name, 'model2':m2.name, 'cosine_distance':dist}
					old+=[odx]

				# Spearman correlation between time order and cosine distance:
				# a monotonic trend indicates steady semantic change.
				X=list(range(len(old)))
				Y=[dx['cosine_distance'] for dx in old]
				r,p = spearmanr(X,Y)
				for dx in old:
					dx['spearman_r'],dx['spearman_p']=r,p
					yield dx


		tools.writegen('data.rate_of_change.txt', writegen)
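
The trend statistic here is simply a Spearman correlation between the time order of the model pairs and the cosine distances. A minimal standalone sketch of that step, using invented distances rather than output from any corpus:

from scipy.stats import spearmanr

# Illustrative cosine distances for one word across consecutive,
# time-ordered model pairs (invented numbers, not real output).
distances = [0.12, 0.18, 0.25, 0.31, 0.44]
X = list(range(len(distances)))   # 0, 1, 2, ... = position in time
r, p = spearmanr(X, distances)    # monotonic trend and its p-value
print(r, p)                       # r near 1.0 => steadily increasing distance
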
Example no. 2
	def model_ranks(self,words=None,special=None,topn=50,periods=None,num_runs=None):
		models = self.models if not periods else [m for m in self.models if m.period in periods]
		models = models if not num_runs else [m for m in models if m.run_num<=num_runs]
		models = [m for m in models if m.exists]
		print('>> MODELS:',[m.name for m in models])
		#return
		periods=set(periods) if periods else set(self.periods)
		special = KEYWORDS if not special else special
		if not words:
			words=list(self.mfw())
			wordset=set(words)
			for x in special:
				if not x in wordset:
					words+=[x]

		print(">> getting ranks for {} words, where ranks are calculated against {} words...".format(len(special), len(words)))
		#print words
		#return

		def writegen():
			ww2periods=defaultdict(set)
			pool = mp.Pool()
			args = [(model,words,topn,special) for model in models]
			#for dx in (dx for res in pool.imap(do_model_rank,args) for dx in res):
			for ld in pool.imap(do_model_rank,args):
				for dx in ld:
					yield dx
					ww=(dx['word1'],dx['word2'])
					ww2periods[ww]|={dx['model_name_2']}
			# For any word pair missing from a period, emit a sentinel row
			# (rank 666, cosine 0) so every pair covers every period.
			for ww in ww2periods:
				for missingperiod in periods-ww2periods[ww]:
					yield {'word1':ww[0], 'word2':ww[1], 'closeness_rank':666, 'closeness_cosine':0, 'model_name_2':missingperiod}

		tools.writegen('data.word2vec.consolidated.ranks.{0}.txt'.format(self.name), writegen)
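
The fan-out pattern above (a top-level worker fed argument tuples through Pool.imap, with each returned list flattened into rows) can be sketched on its own; do_work and the toy rows below are placeholders, not the repository's do_model_rank:

import multiprocessing as mp

def do_work(args):  # placeholder for a top-level worker like do_model_rank
    model_name, words = args
    return [{'model': model_name, 'word': w} for w in words]

if __name__ == '__main__':
    jobs = [('model_A', ['vertu', 'honour']), ('model_B', ['vertu', 'honour'])]
    with mp.Pool() as pool:
        for rows in pool.imap(do_work, jobs):   # one list of rows per job, in order
            for row in rows:
                print(row)
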
Example no. 3
	def rate_of_change(self,words=None,topn=100):
		if not self.aligned:
			print('>> Rate of Change requires that the word2vec models have been aligned. Run align() first.')
			return

		if not words: words=self.mfw()
		num_words=len(words)

		def writegen():
			for i,word in enumerate(words):
				print('>>',num_words-i,word,'..')
				old=[]
				for i1,m1 in enumerate(self.models):
					for i2,m2 in enumerate(self.models):
						if i1<=i2: continue
						## jaccard with top N
						res1=m1.similar(word,topn=topn)
						res2=m2.similar(word,topn=topn)
						words1,csim1=list(zip(*res1))
						words2,csim2=list(zip(*res2))
						wordset1=set(words1)
						wordset2=set(words2)
						jacc=float(len(wordset1 & wordset2)) / float(len(wordset1 | wordset2))

						## spearman with all
						"""
						assert len(m1.vocabset) == len(m2.vocabset)
						vocsize=len(m1.vocabset)
						res1=m1.similar(word,topn=vocsize)
						res2=m2.similar(word,topn=vocsize)
						res1.sort()
						res2.sort()
						words1,csim1=zip(*res1)
						words2,csim2=zip(*res2)
						sp_r,sp_p = spearmanr(csim1,csim2)
						#"""
						sp_r,sp_p=None,None

						sim=m1.similarity(word,m2)
						dist=1-sim
						m1name=m1.name.replace(m1.corpus.name+'.','')
						m2name=m2.name.replace(m2.corpus.name+'.','')
						odx1={'word':word, 'model1':m1name, 'model2':m2name, 'cosine_distance':dist, 'spearman_r':sp_r, 'spearman_p':sp_p, 'jaccard':jacc, 'words_only_in_model1':', '.join(wordset1-wordset2), 'words_only_in_model2':', '.join(wordset2-wordset1), 'is_keyword':word in KEYWORDS}
						odx2={'word':word, 'model1':m2name, 'model2':m1name, 'cosine_distance':dist, 'spearman_r':sp_r, 'spearman_p':sp_p, 'jaccard':jacc, 'words_only_in_model1':', '.join(wordset2-wordset1), 'words_only_in_model2':', '.join(wordset1-wordset2), 'is_keyword':word in KEYWORDS}
						yield odx1
						yield odx2


		tools.writegen('data.rate_of_change.txt', writegen)
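
The Jaccard score above compares the top-N nearest-neighbour lists of the same word in two period models. A self-contained version of that measure, with toy neighbour lists:

def jaccard(neighbours1, neighbours2):
    s1, s2 = set(neighbours1), set(neighbours2)
    return len(s1 & s2) / len(s1 | s2)   # shared neighbours / all neighbours

# Toy top-3 neighbour lists for one word in two period models.
print(jaccard(['king', 'queen', 'prince'], ['king', 'duke', 'prince']))  # 0.5
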
Example no. 4
	def model_ranks_lm(self,fn=None,max_rank=100):
		if not fn: fn='data.word2vec.consolidated.ranks.{0}.txt'.format(self.name)

		## Build the wordpair -> period -> [ranks] lookup from the consolidated ranks file
		from collections import defaultdict
		wordpair2period2ranks={}
		for d in tools.readgen(fn):
			wordpair=(d['word1'],d['word2'])
			period=d['model_name_2']
			rank=float(d['closeness_rank'])
			if max_rank and rank>max_rank: continue
			if not wordpair in wordpair2period2ranks: wordpair2period2ranks[wordpair]=defaultdict(list)
			wordpair2period2ranks[wordpair][period]+=[rank]

		def writegen():
			numwordpairs=len(wordpair2period2ranks)
			for i,wordpair in enumerate(wordpair2period2ranks):
				if not i%100:
					print('>>',i,numwordpairs,'...')
				X,Y=[],[]
				for period in sorted(wordpair2period2ranks[wordpair]):
					x=int(period.split('-')[0])
					for y in wordpair2period2ranks[wordpair][period]:
						Y+=[y]
						X+=[x]
					#wordpair2period2ranks[wordpair][period]=np.median(wordpair2period2ranks[wordpair][period])
					#Y+=[wordpair2period2ranks[wordpair][period]]
				#X=list(range(len(Y)))

				if len(set(X))<2: continue

				# Linear fit of rank against period start year, plus Pearson correlation.
				a,b,RR = tools.linreg(X,Y)
				pr,pp=pearsonr(X,Y)

				odx={}
				odx['word1'],odx['word2']=wordpair
				odx['num_periods']=len(set(X))
				odx['linreg_RR']=RR
				odx['pearson_R']=pr
				odx['pearson_P']=pp
				odx['linreg_slope']=a
				odx['rank_diff']=np.mean(Y[-2:]) - np.mean(Y[:2])
				yield odx

		tools.writegen(fn.replace('.txt','.linreg.txt'), writegen)
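
tools.linreg is specific to this repository; the same fit of closeness rank against period start year can be approximated with scipy.stats.linregress, shown here on invented numbers:

from scipy.stats import linregress

# Period start years and closeness ranks for one word pair (invented numbers).
X = [1700, 1720, 1740, 1760, 1780]
Y = [12, 18, 25, 40, 66]

fit = linregress(X, Y)
print(fit.slope, fit.rvalue ** 2, fit.pvalue)   # slope, R^2, p-value
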
Example no. 5
	def rate_of_change_cosine(self,words=None):
		if not self.aligned:
			print('>> Rate of Change requires that the word2vec models have been aligned. Run align() first.')
			return

		if not words: words=self.mfw()

		def writegen():
			for word in words:
				old=[]
				for i1,m1 in enumerate(self.models):
					for i2,m2 in enumerate(self.models):
						# Skip self-comparisons and repeated orderings; both
						# directions of each pair are written out as odx1/odx2 below.
						if i1<=i2: continue
						sim=m1.similarity(word,m2)
						dist=1-sim
						m1name=m1.name.replace(m1.corpus.name+'.','')
						m2name=m2.name.replace(m2.corpus.name+'.','')
						odx1={'word':word, 'model1':m1name, 'model2':m2name, 'cosine_distance':dist}
						odx2={'word':word, 'model1':m2name, 'model2':m1name, 'cosine_distance':dist}
						yield odx1
						yield odx2


		tools.writegen('data.rate_of_change.txt', writegen)
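
m1.similarity(word, m2) is a method of this project's model wrapper; the quantity recorded is presumably one minus the cosine similarity between the word's vectors in the two aligned spaces. A plain NumPy sketch of that distance, on toy 3-dimensional vectors:

import numpy as np

def cosine_distance(v1, v2):
    return 1 - np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

# Toy vectors for the same word in two aligned period models.
v_a = np.array([0.2, 0.1, 0.9])
v_b = np.array([0.3, 0.4, 0.7])
print(cosine_distance(v_a, v_b))
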
Example no. 6
	def gen_semantic_networks(self,k_core=None):
		"""
		Compare the semantic networks across the separate W2V models in this W2Vs object.
		"""

		name=self.corpus.name
		goog_url=SEMANTIC_NETWORK_GOOG_URLS[name]
		cluster_ld = tools.tsv2ld(goog_url)
		cluster_id2d=tools.ld2dd(cluster_ld,'ID')
		node_ld = tools.tsv2ld(self.fn.replace('.graphml','.analysis-with-modularity.txt'))
		id2ld =tools.ld2dld(node_ld,'partition_id')

		def writegen():
			pool=mp.Pool(processes=4)
			for model in self.models:
				model.mfw_d=self.mfw_d

			# Compute each model's semantic network in a worker process.
			proc = [pool.apply_async(do_semantic_network, args=(model,k_core)) for model in self.models]
			for gen in proc:
				for dx in gen.get():
					yield dx

		tools.writegen('word2vec.comparison.semantic_networks.'+self.name+'.txt', writegen)
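
The parallel pattern here differs slightly from Example no. 2: tasks are submitted with apply_async and each AsyncResult is drained with .get(). A minimal sketch of that pattern with a placeholder worker (square stands in for do_semantic_network):

import multiprocessing as mp

def square(x):   # placeholder worker; returns a small list of rows
    return [x, x * x]

if __name__ == '__main__':
    with mp.Pool(processes=4) as pool:
        tasks = [pool.apply_async(square, args=(n,)) for n in range(3)]
        for task in tasks:
            for value in task.get():   # .get() blocks until that task finishes
                print(value)
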