def rate_of_change_bigrams(self, words=None):
    """Write per-word cosine distances between consecutive aligned models,
    plus a Spearman trend (distance vs. time-step index) for each word.

    Args:
        words: iterable of words to measure; defaults to ``self.mfw()``.

    Side effects:
        Writes rows to 'data.rate_of_change.txt' via ``tools.writegen``.
        NOTE(review): rate_of_change() and rate_of_change_cosine() write to
        this same filename and will overwrite each other -- confirm intended.
    """
    if not self.aligned:
        print(
            '>> Rate of Change requires that the word2vec models have been aligned. Run align() first.'
        )
        return
    if not words:
        words = self.mfw()

    # FIX: the original body referenced an undefined name `m1_m2s`, which
    # raised NameError at runtime. Reconstruct it as the list of consecutive
    # model pairs, matching the "change over time" semantics of this method.
    m1_m2s = list(zip(self.models, self.models[1:]))

    def writegen():
        for word in words:
            rows = []
            for m1, m2 in m1_m2s:
                # Skip pairs where either model lacks the word.
                if word not in m1.vocabset or word not in m2.vocabset:
                    continue
                sim = m1.similarity(word, m2)
                rows.append({
                    'word': word,
                    'model1': m1.name,
                    'model2': m2.name,
                    'cosine_distance': 1 - sim,
                })
            # Spearman correlation of distance against time-step index:
            # a monotone trend indicates steady semantic drift.
            X = list(range(len(rows)))
            Y = [dx['cosine_distance'] for dx in rows]
            r, p = spearmanr(X, Y)
            for dx in rows:
                dx['spearman_r'], dx['spearman_p'] = r, p
                yield dx

    tools.writegen('data.rate_of_change.txt', writegen)
def rate_of_change_cosine(self, words=None):
    """Write pairwise cosine distances for each word across every pair of
    aligned models (both row orderings are emitted per pair).

    Args:
        words: iterable of words to measure; defaults to ``self.mfw()``.

    Side effects:
        Writes rows to 'data.rate_of_change.txt' via ``tools.writegen``.
        NOTE(review): rate_of_change() and rate_of_change_bigrams() write to
        this same filename and will overwrite each other -- confirm intended.
    """
    if not self.aligned:
        print(
            '>> Rate of Change requires that the word2vec models have been aligned. Run align() first.'
        )
        return
    if not words:
        words = self.mfw()

    def writegen():
        for word in words:
            for i1, m1 in enumerate(self.models):
                for i2, m2 in enumerate(self.models):
                    # FIX: visit each unordered model pair exactly once
                    # (same guard as rate_of_change()). The original looped
                    # over all ordered pairs INCLUDING self-pairs and then
                    # yielded both orderings, so every row was duplicated
                    # and zero-distance self-comparisons were included.
                    if i1 <= i2:
                        continue
                    sim = m1.similarity(word, m2)
                    dist = 1 - sim
                    # Strip the shared corpus prefix from model names.
                    m1name = m1.name.replace(m1.corpus.name + '.', '')
                    m2name = m2.name.replace(m2.corpus.name + '.', '')
                    yield {
                        'word': word,
                        'model1': m1name,
                        'model2': m2name,
                        'cosine_distance': dist
                    }
                    # Mirror row so the output is symmetric in model order.
                    yield {
                        'word': word,
                        'model1': m2name,
                        'model2': m1name,
                        'cosine_distance': dist
                    }

    tools.writegen('data.rate_of_change.txt', writegen)
def gen_semantic_networks(self, k_core=None):
    """Compare the semantic networks across the separate W2V models in this
    W2Vs object, farming the per-model work out to a process pool.

    Args:
        k_core: optional k-core filter forwarded to ``do_semantic_network``.

    Side effects:
        Writes rows to 'word2vec.comparison.semantic_networks.<name>.txt'.
    """
    name = self.corpus.name
    goog_url = SEMANTIC_NETWORK_GOOG_URLS[name]
    # NOTE(review): the four lookups below are computed but never used in
    # this method; kept because tsv2ld may fetch/cache data as a side
    # effect -- confirm and remove if not.
    cluster_ld = tools.tsv2ld(goog_url)
    cluster_id2d = tools.ld2dd(cluster_ld, 'ID')
    node_ld = tools.tsv2ld(
        self.fn.replace('.graphml', '.analysis-with-modularity.txt'))
    id2ld = tools.ld2dld(node_ld, 'partition_id')

    def writegen():
        # FIX: the original never closed/joined the pool (worker-process
        # leak). Ensure cleanup even if a worker raises.
        pool = mp.Pool(processes=4)
        try:
            # Give every model the shared most-frequent-words mapping
            # before dispatching (workers read model.mfw_d).
            for model in self.models:
                model.mfw_d = self.mfw_d
            pending = [
                pool.apply_async(do_semantic_network, args=(model, k_core))
                for model in self.models
            ]
            for result in pending:
                for dx in result.get():
                    yield dx
        finally:
            pool.close()
            pool.join()

    tools.writegen(
        'word2vec.comparison.semantic_networks.' + self.name + '.txt',
        writegen)
def model_ranks(self, words=None, special=None, topn=50, periods=None,
                num_runs=None):
    """Compute closeness ranks for the ``special`` keywords against the
    full word list, across the selected models, filling in sentinel rows
    (rank 666) for periods where a pair never appears.

    Args:
        words: candidate vocabulary; defaults to ``self.mfw()``.
            (FIX: was a mutable default ``words=[]`` that the body mutated
            with ``words += [x]`` -- the default list grew across calls and
            caller-supplied lists were modified in place.)
        special: focus keywords; defaults to the module-level KEYWORDS.
        topn: neighborhood size forwarded to ``do_model_rank``.
        periods: restrict to models from these periods (None = all).
        num_runs: restrict to models with run_num <= num_runs (None = all).

    Side effects:
        Writes rows to 'data.word2vec.consolidated.ranks.<name>.txt'.
    """
    models = self.models if not periods else [
        m for m in self.models if m.period in periods
    ]
    models = models if not num_runs else [
        m for m in models if m.run_num <= num_runs
    ]
    models = [m for m in models if m.exists]
    print('>> MODELS:', [m.name for m in models])

    periods = set(periods) if periods else set(self.periods)
    special = KEYWORDS if not special else special
    if not words:
        words = list(self.mfw())
    else:
        # Defensive copy so the appends below never mutate caller data.
        words = list(words)
    wordset = set(words)
    # Make sure every focus keyword is present in the ranking vocabulary.
    for x in special:
        if x not in wordset:
            words += [x]
    print(
        ">> getting ranks for {} words, where ranks are calculated against {} words..."
        .format(len(special), len(words)))

    def writegen():
        ww2periods = defaultdict(set)
        # FIX: close/join the pool (was leaked in the original).
        pool = mp.Pool()
        try:
            args = [(model, words, topn, special) for model in models]
            for ld in pool.imap(do_model_rank, args):
                for dx in ld:
                    yield dx
                    # Track which periods each word pair was seen in.
                    ww = (dx['word1'], dx['word2'])
                    ww2periods[ww] |= {dx['model_name_2']}
        finally:
            pool.close()
            pool.join()
        # Sentinel rows for word pairs missing from some periods so that
        # downstream time series have a value for every period.
        for ww in ww2periods:
            for missingperiod in periods - ww2periods[ww]:
                yield {
                    'word1': ww[0],
                    'word2': ww[1],
                    'closeness_rank': 666,
                    'closeness_cosine': 0,
                    'model_name_2': missingperiod
                }

    tools.writegen(
        'data.word2vec.consolidated.ranks.{0}.txt'.format(self.name),
        writegen)
def model_ranks_lm(self, fn=None, max_rank=100):
    """Fit a linear trend (and Pearson correlation) to each word pair's
    closeness ranks over time, reading a consolidated-ranks file and
    writing one regression row per pair.

    Args:
        fn: input ranks file; defaults to the consolidated ranks file for
            this W2Vs object.
        max_rank: drop observations whose rank exceeds this (0/None keeps
            everything).

    Side effects:
        Writes results to ``fn`` with '.txt' replaced by '.linreg.txt'.
    """
    if not fn:
        fn = 'data.word2vec.consolidated.ranks.{0}.txt'.format(self.name)

    # pair -> period -> list of observed ranks (over-max ranks dropped)
    pair2period2ranks = {}
    for row in tools.readgen(fn):
        rank = float(row['closeness_rank'])
        if max_rank and rank > max_rank:
            continue
        pair = (row['word1'], row['word2'])
        period2ranks = pair2period2ranks.setdefault(pair, {})
        period2ranks.setdefault(row['model_name_2'], []).append(rank)

    def writegen():
        total = len(pair2period2ranks)
        for idx, pair in enumerate(pair2period2ranks):
            if not idx % 100:
                print('>>', idx, total, '...')
            # Flatten to parallel X (start year of period) / Y (rank)
            # samples, periods in lexicographic order.
            xs, ys = [], []
            for period in sorted(pair2period2ranks[pair]):
                start_year = int(period.split('-')[0])
                for r in pair2period2ranks[pair][period]:
                    ys += [r]
                    xs += [start_year]
            # A trend needs at least two distinct time points.
            if len(set(xs)) < 2:
                continue
            slope, intercept, RR = tools.linreg(xs, ys)
            pr, pp = pearsonr(xs, ys)
            word1, word2 = pair
            yield {
                'word1': word1,
                'word2': word2,
                'num_periods': len(set(xs)),
                'linreg_RR': RR,
                'pearson_R': pr,
                'pearson_P': pp,
                'linreg_slope': slope,
                # Crude end-minus-start change in rank.
                'rank_diff': np.mean(ys[-2:]) - np.mean(ys[:2]),
            }

    tools.writegen(fn.replace('.txt', '.linreg.txt'), writegen)
def rate_of_change(self, words=None, topn=100):
    """Measure semantic change per word between every pair of aligned
    models: cosine distance between the word's vectors, plus Jaccard
    overlap of its top-``topn`` nearest-neighbor sets, with the
    neighbor-set differences spelled out.

    Args:
        words: words to measure; defaults to ``self.mfw()``.
        topn: neighborhood size for the Jaccard comparison.

    Side effects:
        Writes rows to 'data.rate_of_change.txt' via ``tools.writegen``.
        NOTE(review): rate_of_change_bigrams() and rate_of_change_cosine()
        write to this same filename -- confirm intended.
    """
    if not self.aligned:
        print(
            '>> Rate of Change requires that the word2vec models have been aligned. Run align() first.'
        )
        return
    if not words:
        words = self.mfw()
    num_words = len(words)

    def writegen():
        for i, word in enumerate(words):
            # Countdown-style progress indicator.
            print('>>', num_words - i, word, '..')
            for i1, m1 in enumerate(self.models):
                for i2, m2 in enumerate(self.models):
                    # Visit each unordered model pair exactly once.
                    if i1 <= i2:
                        continue
                    # FIX: skip models lacking the word (consistent with
                    # rate_of_change_bigrams) instead of raising on lookup.
                    if word not in m1.vocabset or word not in m2.vocabset:
                        continue
                    # Jaccard overlap of the two top-N neighborhoods.
                    res1 = m1.similar(word, topn=topn)
                    res2 = m2.similar(word, topn=topn)
                    words1, _ = list(zip(*res1))
                    words2, _ = list(zip(*res2))
                    wordset1 = set(words1)
                    wordset2 = set(words2)
                    jacc = float(len(wordset1 & wordset2)) / float(
                        len(wordset1 | wordset2))
                    # Full-vocab Spearman comparison was disabled in the
                    # original (dead commented block removed); keep the
                    # placeholder columns so the output schema is stable.
                    sp_r, sp_p = None, None
                    dist = 1 - m1.similarity(word, m2)
                    m1name = m1.name.replace(m1.corpus.name + '.', '')
                    m2name = m2.name.replace(m2.corpus.name + '.', '')
                    yield {
                        'word': word,
                        'model1': m1name,
                        'model2': m2name,
                        'cosine_distance': dist,
                        'spearman_r': sp_r,
                        'spearman_p': sp_p,
                        'jaccard': jacc,
                        'words_only_in_model1': ', '.join(wordset1 - wordset2),
                        'words_only_in_model2': ', '.join(wordset2 - wordset1),
                        'is_keyword': word in KEYWORDS
                    }
                    # Mirror row with the model roles swapped.
                    yield {
                        'word': word,
                        'model1': m2name,
                        'model2': m1name,
                        'cosine_distance': dist,
                        'spearman_r': sp_r,
                        'spearman_p': sp_p,
                        'jaccard': jacc,
                        'words_only_in_model1': ', '.join(wordset2 - wordset1),
                        'words_only_in_model2': ', '.join(wordset1 - wordset2),
                        'is_keyword': word in KEYWORDS
                    }

    tools.writegen('data.rate_of_change.txt', writegen)