示例#1
0
    def rate_of_change_bigrams(self, words=None):
        """Write each word's cosine-distance series across consecutive
        ("bigram") model pairs, plus a Spearman trend over that series.

        For every adjacent pair of aligned models, the row records the
        cosine distance (1 - similarity) for the word; a Spearman
        correlation of distance vs. pair index is attached to every row
        so downstream analysis can detect monotonic drift.

        words: iterable of words to process; defaults to self.mfw().
        Rows stream to 'data.rate_of_change.txt' via tools.writegen.
        """
        if not self.aligned:
            print(
                '>> Rate of Change requires that the word2vec models have been aligned. Run align() first.'
            )
            return

        if not words: words = self.mfw()

        # BUGFIX: `m1_m2s` was referenced below but never defined anywhere
        # in this method (NameError at runtime). Build consecutive model
        # pairs, matching this method's "bigrams" naming.
        # NOTE(review): assumes self.models is in chronological order —
        # TODO confirm against how models are loaded.
        m1_m2s = list(zip(self.models, self.models[1:]))

        def writegen():
            for word in words:
                rows = []
                for m1, m2 in m1_m2s:
                    # Skip pairs where the word is missing from either vocab.
                    if word not in m1.vocabset or word not in m2.vocabset:
                        continue
                    dist = 1 - m1.similarity(word, m2)
                    rows.append({
                        'word': word,
                        'model1': m1.name,
                        'model2': m2.name,
                        'cosine_distance': dist
                    })

                # Correlate distance against pair index (a proxy for time).
                X = list(range(len(rows)))
                Y = [dx['cosine_distance'] for dx in rows]
                r, p = spearmanr(X, Y)
                for dx in rows:
                    dx['spearman_r'], dx['spearman_p'] = r, p
                    yield dx

        tools.writegen('data.rate_of_change.txt', writegen)
示例#2
0
    def rate_of_change_cosine(self, words=None):
        """Write the cosine distance between each word's aligned vectors
        for every distinct pair of models.

        Each unordered model pair produces two symmetric rows (one per
        orientation) so downstream joins work from either direction.

        words: iterable of words to process; defaults to self.mfw().
        Rows stream to 'data.rate_of_change.txt' via tools.writegen.
        """
        if not self.aligned:
            print(
                '>> Rate of Change requires that the word2vec models have been aligned. Run align() first.'
            )
            return

        if not words: words = self.mfw()

        def writegen():
            for word in words:
                for i1, m1 in enumerate(self.models):
                    for i2, m2 in enumerate(self.models):
                        # BUGFIX: the original iterated every ordered pair,
                        # comparing each model with itself and emitting each
                        # unordered pair four times. Restrict to i1 > i2
                        # (consistent with rate_of_change()); the two yields
                        # below still emit both orientations exactly once.
                        if i1 <= i2: continue
                        dist = 1 - m1.similarity(word, m2)
                        # Strip the corpus-name prefix from model names.
                        m1name = m1.name.replace(m1.corpus.name + '.', '')
                        m2name = m2.name.replace(m2.corpus.name + '.', '')
                        yield {
                            'word': word,
                            'model1': m1name,
                            'model2': m2name,
                            'cosine_distance': dist
                        }
                        yield {
                            'word': word,
                            'model1': m2name,
                            'model2': m1name,
                            'cosine_distance': dist
                        }

        tools.writegen('data.rate_of_change.txt', writegen)
示例#3
0
    def gen_semantic_networks(self, k_core=None):
        """Compare the semantic networks across the separate W2V models in
        this W2Vs object.

        Fans out do_semantic_network over the models with a worker pool
        and streams the resulting row dicts to a per-corpus output file.

        k_core: optional k-core threshold forwarded to do_semantic_network.
        """
        name = self.corpus.name
        goog_url = SEMANTIC_NETWORK_GOOG_URLS[name]
        # NOTE(review): the four lookups below are loaded but never used in
        # this method — possibly kept for their fetch/validation side
        # effects; confirm before removing.
        cluster_ld = tools.tsv2ld(goog_url)
        cluster_id2d = tools.ld2dd(cluster_ld, 'ID')
        node_ld = tools.tsv2ld(
            self.fn.replace('.graphml', '.analysis-with-modularity.txt'))
        id2ld = tools.ld2dld(node_ld, 'partition_id')

        def writegen():
            # BUGFIX: the original never closed the pool; the context
            # manager tears the workers down deterministically.
            with mp.Pool(processes=4) as pool:
                # Share the MFW table with the models sent to workers.
                for model in self.models:
                    model.mfw_d = self.mfw_d

                results = [
                    pool.apply_async(do_semantic_network,
                                     args=(model, k_core))
                    for model in self.models
                ]
                for res in results:
                    for dx in res.get():
                        yield dx

        tools.writegen(
            'word2vec.comparison.semantic_networks.' + self.name + '.txt',
            writegen)
示例#4
0
    def model_ranks(self,
                    words=None,
                    special=None,
                    topn=50,
                    periods=None,
                    num_runs=None):
        """Compute closeness ranks of `special` words against `words` across
        the selected models, writing one row per (word1, word2, model) and
        sentinel rows (rank 666) for periods where a pair never appeared.

        words:    vocabulary to rank against; defaults to MFW plus `special`.
                  (BUGFIX: default was a mutable `[]`; `None` is
                  backward-compatible since both defaults are falsy.)
        special:  focal words; defaults to the module-level KEYWORDS.
        topn:     forwarded to do_model_rank.
        periods:  restrict to models whose period is in this set.
        num_runs: restrict to models with run_num <= num_runs.
        """
        models = self.models if not periods else [
            m for m in self.models if m.period in periods
        ]
        models = models if not num_runs else [
            m for m in models if m.run_num <= num_runs
        ]
        models = [m for m in models if m.exists]
        print('>> MODELS:', [m.name for m in models])
        periods = set(periods) if periods else set(self.periods)
        special = KEYWORDS if not special else special
        if not words:
            words = list(self.mfw())
            wordset = set(words)
            # Ensure every special keyword is ranked even if not in the MFW.
            for x in special:
                if x not in wordset:
                    words.append(x)

        print(
            ">> getting ranks for {} words, where ranks are calculated against {} words..."
            .format(len(special), len(words)))

        def writegen():
            ww2periods = defaultdict(set)
            # BUGFIX: close the worker pool deterministically; the original
            # leaked it.
            with mp.Pool() as pool:
                args = [(model, words, topn, special) for model in models]
                for ld in pool.imap(do_model_rank, args):
                    for dx in ld:
                        yield dx
                        # Track which periods each pair was observed in.
                        ww = (dx['word1'], dx['word2'])
                        ww2periods[ww] |= {dx['model_name_2']}
            # Sentinel rows keep downstream tables rectangular: rank 666 /
            # cosine 0 marks "pair absent in this period".
            for ww in ww2periods:
                for missingperiod in periods - ww2periods[ww]:
                    yield {
                        'word1': ww[0],
                        'word2': ww[1],
                        'closeness_rank': 666,
                        'closeness_cosine': 0,
                        'model_name_2': missingperiod
                    }

        tools.writegen(
            'data.word2vec.consolidated.ranks.{0}.txt'.format(self.name),
            writegen)
示例#5
0
    def model_ranks_lm(self, fn=None, max_rank=100):
        """Fit a per-wordpair linear trend of closeness rank over time.

        Reads the consolidated ranks file, groups ranks by (word1, word2)
        and period, then regresses rank against the period's start year,
        writing slope, R^2, Pearson stats, and a first-vs-last rank delta
        to a sibling '.linreg.txt' file.

        fn:       input ranks file; defaults to this object's consolidated
                  ranks filename.
        max_rank: drop rows whose rank exceeds this (0/None disables).
        """
        if not fn:
            fn = 'data.word2vec.consolidated.ranks.{0}.txt'.format(self.name)

        ## Build necessary data structure
        from collections import defaultdict
        pair2period2ranks = {}
        for row in tools.readgen(fn):
            pair = (row['word1'], row['word2'])
            period_key = row['model_name_2']
            rank_val = float(row['closeness_rank'])
            if max_rank and rank_val > max_rank: continue
            if pair not in pair2period2ranks:
                pair2period2ranks[pair] = defaultdict(list)
            pair2period2ranks[pair][period_key].append(rank_val)

        def writegen():
            total = len(pair2period2ranks)
            for idx, pair in enumerate(pair2period2ranks):
                # Progress line every 100 pairs.
                if not idx % 100:
                    print('>>', idx, total, '...')
                X, Y = [], []
                for period_key in sorted(pair2period2ranks[pair]):
                    # Use the period's start year (e.g. '1800-1850' -> 1800)
                    # as the independent variable.
                    start_year = int(period_key.split('-')[0])
                    for rank_val in pair2period2ranks[pair][period_key]:
                        Y.append(rank_val)
                        X.append(start_year)

                # Need at least two distinct years to fit a line.
                if len(set(X)) < 2: continue

                slope, intercept, RR = tools.linreg(X, Y)
                pr, pp = pearsonr(X, Y)

                yield {
                    'word1': pair[0],
                    'word2': pair[1],
                    'num_periods': len(set(X)),
                    'linreg_RR': RR,
                    'pearson_R': pr,
                    'pearson_P': pp,
                    'linreg_slope': slope,
                    'rank_diff': np.mean(Y[-2:]) - np.mean(Y[:2]),
                }

        tools.writegen(fn.replace('.txt', '.linreg.txt'), writegen)
示例#6
0
    def rate_of_change(self, words=None, topn=100):
        """Write per-word change statistics for every pair of aligned models.

        For each unordered model pair this emits two symmetric rows (one per
        orientation) containing: the cosine distance between the word's
        vectors, the Jaccard overlap of the two models' top-`topn` neighbor
        lists, and the neighbors unique to each model.

        words: iterable of words to process; defaults to self.mfw().
        topn:  neighborhood size used for the Jaccard comparison.
        Rows stream to 'data.rate_of_change.txt' via tools.writegen.
        """
        if not self.aligned:
            print(
                '>> Rate of Change requires that the word2vec models have been aligned. Run align() first.'
            )
            return

        if not words: words = self.mfw()
        num_words = len(words)

        def writegen():
            for i, word in enumerate(words):
                # Countdown progress indicator.
                print('>>', num_words - i, word, '..')
                # NOTE(review): `old` is never used below — dead local.
                old = []
                for i1, m1 in enumerate(self.models):
                    for i2, m2 in enumerate(self.models):
                        # Visit each unordered pair once; both orientations
                        # are emitted explicitly below.
                        if i1 <= i2: continue
                        ## jaccard with top N
                        res1 = m1.similar(word, topn=topn)
                        res2 = m2.similar(word, topn=topn)
                        words1, csim1 = list(zip(*res1))
                        words2, csim2 = list(zip(*res2))
                        wordset1 = set(words1)
                        wordset2 = set(words2)
                        jacc = float(len(wordset1 & wordset2)) / float(
                            len(wordset1 | wordset2))

                        ## spearman with all
                        # Disabled alternative (kept as a no-op string
                        # literal): rank-correlate similarities over the
                        # full shared vocabulary instead of top-N.
                        """
						assert len(m1.vocabset) == len(m2.vocabset)
						vocsize=len(m1.vocabset)
						res1=m1.similar(word,topn=vocsize)
						res2=m2.similar(word,topn=vocsize)
						res1.sort()
						res2.sort()
						words1,csim1=zip(*res1)
						words2,csim2=zip(*res2)
						sp_r,sp_p = spearmanr(csim1,csim2)
						#"""
                        # Placeholders while the block above is disabled.
                        sp_r, sp_p = None, None

                        sim = m1.similarity(word, m2)
                        dist = 1 - sim
                        # Strip the corpus-name prefix from model names.
                        m1name = m1.name.replace(m1.corpus.name + '.', '')
                        m2name = m2.name.replace(m2.corpus.name + '.', '')
                        odx1 = {
                            'word': word,
                            'model1': m1name,
                            'model2': m2name,
                            'cosine_distance': dist,
                            'spearman_r': sp_r,
                            'spearman_p': sp_p,
                            'jaccard': jacc,
                            'words_only_in_model1':
                            ', '.join(wordset1 - wordset2),
                            'words_only_in_model2':
                            ', '.join(wordset2 - wordset1),
                            'is_keyword': word in KEYWORDS
                        }
                        # Mirror row: same stats with model roles swapped.
                        odx2 = {
                            'word': word,
                            'model1': m2name,
                            'model2': m1name,
                            'cosine_distance': dist,
                            'spearman_r': sp_r,
                            'spearman_p': sp_p,
                            'jaccard': jacc,
                            'words_only_in_model1':
                            ', '.join(wordset2 - wordset1),
                            'words_only_in_model2':
                            ', '.join(wordset1 - wordset2),
                            'is_keyword': word in KEYWORDS
                        }
                        yield odx1
                        yield odx2

        tools.writegen('data.rate_of_change.txt', writegen)