Example #1
 def text_similarity(self, t1, t2):
     if self._model == 'tfidf':
         t1_vec = matutils.any2sparse(self.text2model(t1))
         t2_vec = matutils.any2sparse(self.text2model(t2))
         return matutils.cossim(t1_vec, t2_vec)
     else:
         t1_vec = matutils.any2sparse(self.text2model(t1))
         t2_vec = matutils.any2sparse(self.text2model(t2))
         return matutils.cossim(t1_vec, t2_vec)
Example #2
File: analysis.py  Project: gsi-upm/sematch
 def text_similarity(self, t1, t2):
     if self._model == 'tfidf':
         t1_vec = matutils.any2sparse(self.text2model(t1))
         t2_vec = matutils.any2sparse(self.text2model(t2))
         return matutils.cossim(t1_vec, t2_vec)
     else:
         t1_vec = matutils.any2sparse(self.text2model(t1))
         t2_vec = matutils.any2sparse(self.text2model(t2))
         return matutils.cossim(t1_vec, t2_vec)
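Note: text2model here is project-specific, but the 'tfidf' branch boils down to the usual gensim pipeline: tokenize, look up a Dictionary, apply a TfidfModel, convert with any2sparse, and compare with cossim. A minimal self-contained sketch of that pipeline (the toy corpus and token lists below are made up):

from gensim import corpora, models, matutils

texts = [["cat", "dog", "bird"], ["cat", "dog", "fish"], ["tree", "graph", "path"]]
dictionary = corpora.Dictionary(texts)
tfidf = models.TfidfModel([dictionary.doc2bow(t) for t in texts])

def text_similarity(tokens1, tokens2):
    v1 = matutils.any2sparse(tfidf[dictionary.doc2bow(tokens1)])
    v2 = matutils.any2sparse(tfidf[dictionary.doc2bow(tokens2)])
    return matutils.cossim(v1, v2)

print(text_similarity(["cat", "bird"], ["cat", "fish"]))   # > 0, the texts share "cat"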
Example #3
def make_scores_for_sample():
    doc2vec_model = doc2vec.Doc2Vec.load('doc2vec_weigths')
    logging.info('doc2vec loaded')
    tfidf_unigram_model = TfidfModel.load('tfidf_unigram')
    logging.info('tfidf unigram loaded')
    tfidf_bigram_model = TfidfModel.load('tfidf_bigram')
    logging.info('tfidf bigram loaded')
    d1 = corpora.Dictionary.load('./dict_1.gensim')
    logging.info('dict1 loaded')
    d2 = corpora.Dictionary.load('./dict_2.gensim')
    logging.info('dict2 loaded')
    queries = pd.read_csv('./queries_norm.tsv', sep='\t', header=None, index_col=0)
    sample = pd.read_csv('./sample.csv', sep=',').sort_values(by=['DocumentId'])
    with open('./submission.csv', 'w') as f:
        writer = csv.writer(f, delimiter=',')
        writer.writerow(['QueryId', 'DocumentId', 'Score'])
        for idx, row in tqdm(sample.iterrows()):
            query_id = row['QueryId']
            doc_id = row['DocumentId']
            doc2vec_score = doc2vec_model.docvecs.similarity('DOC_%d' % doc_id, 'QUERY_%d' % query_id)
            doc = get_doc(doc_id)
            query = str(queries.loc[query_id])
            doc_title = str(doc[1])
            doc_content = str(doc[2])

            doc_title_words = doc_title.split()
            doc_content_words = doc_content.split()
            query_words = query.split()

            doc_title_bigrams = d2.doc2bow(list(map(lambda x: '\t'.join(x), zip(doc_title_words[:-1], doc_title_words[1:]))))
            doc_content_bigrams = d2.doc2bow(list(map(lambda x: '\t'.join(x), zip(doc_content_words[:-1], doc_content_words[1:]))))
            query_bigrams = d2.doc2bow(list(map(lambda x: '\t'.join(x), zip(query_words[:-1], query_words[1:]))))

            doc_title_words = d1.doc2bow(doc_title_words)
            doc_content_words = d1.doc2bow(doc_content_words)
            query_words = d1.doc2bow(query_words)

            doc_title_words = tfidf_unigram_model[doc_title_words]
            doc_content_words = tfidf_unigram_model[doc_content_words]
            query_words = tfidf_unigram_model[query_words]

            doc_title_bigrams = tfidf_bigram_model[doc_title_bigrams]
            doc_content_bigrams = tfidf_bigram_model[doc_content_bigrams]
            query_bigrams = tfidf_bigram_model[query_bigrams]

            tfidf_title_score_uni = matutils.cossim(doc_title_words, query_words)
            tfidf_content_score_uni = matutils.cossim(doc_content_words, query_words)
            tfidf_title_score_bi = matutils.cossim(doc_title_bigrams, query_bigrams)
            tfidf_content_score_bi = matutils.cossim(doc_content_bigrams, query_bigrams)

            score = (2 * tfidf_content_score_bi + 2 * tfidf_title_score_uni + tfidf_content_score_uni + 0.5 * doc2vec_score) / 5.5
            writer.writerow([query_id, doc_id, score])
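Note: the bigram features above are built by joining adjacent tokens with a tab before handing them to doc2bow. The construction in isolation (the on-the-fly Dictionary below stands in for the dict_2.gensim loaded from disk):

from gensim import corpora

words = "the quick brown fox".split()
bigrams = ["\t".join(pair) for pair in zip(words[:-1], words[1:])]
# ['the\tquick', 'quick\tbrown', 'brown\tfox']

d2 = corpora.Dictionary([bigrams])   # stand-in for the loaded bigram dictionary
print(d2.doc2bow(bigrams))           # [(0, 1), (1, 1), (2, 1)]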
Example #4
    def similarity(self, t, extraction_pattern):
        (bef, bet, aft) = (0, 0, 0)

        if t.bef_vector is not None:
            bef = cossim(t.bef_vector, extraction_pattern.centroid_bef)

        if t.bet_vector is not None:
            bet = cossim(t.bet_vector, extraction_pattern.centroid_bet)

        if t.aft_vector is not None:
            aft = cossim(t.aft_vector, extraction_pattern.centroid_aft)

        return self.config.alpha * bef + self.config.beta * bet + self.config.gamma * aft
Example #5
    def similarity(self, t, extraction_pattern):

        (bef, bet, aft) = (0, 0, 0)

        if t.bef_vector is not None and extraction_pattern.centroid_bef is not None:
            bef = cossim(t.bef_vector, extraction_pattern.centroid_bef)

        if t.bet_vector is not None and extraction_pattern.centroid_bet is not None:
            bet = cossim(t.bet_vector, extraction_pattern.centroid_bet)

        if t.aft_vector is not None and extraction_pattern.centroid_aft is not None:
            aft = cossim(t.aft_vector, extraction_pattern.centroid_aft)

        return self.config.alpha*bef + self.config.beta*bet + self.config.gamma*aft
Example #6
def test(file_name):
    dictionary = corpora.Dictionary.load('./temp_tfidf/temp_dict')
    corpus = corpora.MmCorpus('./temp_tfidf/temp_mm')
    tfidf = models.TfidfModel.load('./temp_tfidf/tfidf_value')
    tags = load_tags('./temp_tfidf/tags')

    with open(file_name, 'r') as f, open('./result/tfidf_res', 'w') as outf:
        corpus_tfidf = tfidf[corpus]
        for line in f:
            items = line.decode('gbk').strip().split('\t')
            if len(items) != 2:
                raise Exception('error')
            qes = items[1].split(' ')
            new_vec = dictionary.doc2bow(qes)
            new_tfidf = tfidf[new_vec]
            h = []
            k = 10
            cnt = 0
            for dic in corpus_tfidf:
                s = matutils.cossim(new_tfidf, dic)
                heapq.heappush(h, (s, cnt))
                if len(h) > k:
                    heapq.heappop(h)
                cnt += 1
            candidate = '&&'.join(['%s:%s' % (tags[i], s) for (s, i) in h])
            outf.write('%s\t%s\n' %
                       (items[0].encode('gbk'), candidate.encode('gbk')))
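Note: Example #6 keeps only the k best-scoring documents with a bounded min-heap: push every (score, index) pair and pop the minimum whenever the heap grows past k. The pattern on its own, with made-up scores:

import heapq

scores = [0.12, 0.80, 0.33, 0.95, 0.27, 0.61]
k = 3

heap = []
for idx, s in enumerate(scores):
    heapq.heappush(heap, (s, idx))
    if len(heap) > k:
        heapq.heappop(heap)        # discard the current minimum

print(sorted(heap, reverse=True))  # [(0.95, 3), (0.8, 1), (0.61, 5)]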
Example #7
    def score(query, profile, data=None):
        if not len(profile.description):
            return [-1]

        vectorspace = VectorSpace([])

        tokenized_description = LowerTokenizer.tokenize(profile.description)
        description_vector = vectorspace.vector_for_document(
            tokenized_document=tokenized_description,
            update=True)

        ddg_description = DuckDuckDescription.query(query.lower())

        ddg_vector = []
        if ddg_description:
            ddg_text = ddg_description['description']['text']
            ddg_tokenized = LowerTokenizer.tokenize(ddg_text)
            ddg_vector = vectorspace.vector_for_document(
                tokenized_document=ddg_tokenized,
                update=True)

        if not len(ddg_vector):
            return [-1]

        return [cossim(description_vector, ddg_vector)]
Example #8
    def order_by_tf_id_rank(self, headline, sentences, number_of_sentences):
        headline_bow = self.vocab.doc2bow(sent2stokens_wostop(headline))
        headline_tfidf = self.tfidf_model[headline_bow]

        scored_sentences = []
        'Replace newlines with blank, since the punkt tokenizer does not recognize .[newline]'
        #sentences = sentences.replace('\n', ' ')

        for sentence in self.tokenizer.tokenize(sentences):
            sentence_tfidf = self.vocab.doc2bow(sent2stokens_wostop(sentence))
            sim = cossim(headline_tfidf, sentence_tfidf)
            #print(str(sim))
            scored_sentences.append([sentence, sim])

        sorted_sentences = sorted(
            scored_sentences,
            key=lambda scored_sentences: scored_sentences[1],
            reverse=True)
        '''
        for sentence in sorted_sentences:
        print(str(sentence))
        '''
        ' return sorted_sentences '

        sentences_string = ""
        current_sentence_number = 0
        for sentence in sorted_sentences:
            current_sentence_number += 1
            sentences_string += sentence[0] + ' '
            if current_sentence_number == number_of_sentences:
                break
        #print("Ranked: \n " + sentences_string)
        return sentences_string
Example #9
def score(tweet, webpage):        
    lda = ldamodel.get_lda()    
    dictionary = ldamodel.get_dictionary()    
    tweet_vec = lda[dictionary.doc2bow(tweet['terms'])]  
    news_vec = cached_news_vector(webpage["content"].encode("utf-8"))
    score = matutils.cossim(news_vec, tweet_vec)                
    return score
Example #10
    def test_lee(self):
        """correlation with human data > 0.6
        (this is the value which was achieved in the original paper)
        """

        global bg_corpus, corpus

        # create a dictionary and corpus (bag of words)
        dictionary = corpora.Dictionary(bg_corpus)
        bg_corpus = [dictionary.doc2bow(text) for text in bg_corpus]
        corpus = [dictionary.doc2bow(text) for text in corpus]

        # transform the bag of words with log_entropy normalization
        log_ent = models.LogEntropyModel(bg_corpus)
        bg_corpus_ent = log_ent[bg_corpus]

        # initialize an LSI transformation from background corpus
        lsi = models.LsiModel(bg_corpus_ent, id2word=dictionary, num_topics=200)
        # transform small corpus to lsi bow->log_ent->fold-in-lsi
        corpus_lsi = lsi[log_ent[corpus]]

        # compute pairwise similarity matrix and extract upper triangular
        res = np.zeros((len(corpus), len(corpus)))
        for i, par1 in enumerate(corpus_lsi):
            for j, par2 in enumerate(corpus_lsi):
                res[i, j] = matutils.cossim(par1, par2)
        flat = res[np.triu_indices(len(corpus), 1)]

        cor = np.corrcoef(flat, human_sim_vector)[0, 1]
        logging.info("LSI correlation coefficient is %s", cor)
        self.assertTrue(cor > 0.6)
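Note: the evaluation step above flattens the upper triangle of the pairwise similarity matrix and correlates it with human judgements. That step in isolation, on a hypothetical 3x3 matrix:

import numpy as np

res = np.array([[1.0, 0.8, 0.1],
                [0.8, 1.0, 0.3],
                [0.1, 0.3, 1.0]])
human_sim_vector = np.array([0.9, 0.2, 0.4])   # hypothetical ratings for the same pairs

flat = res[np.triu_indices(len(res), 1)]       # [0.8, 0.1, 0.3]
print(np.corrcoef(flat, human_sim_vector)[0, 1])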
Example #11
def ComputerSimilarity(model, corpus, blocks):
	print blocks;
	print(s);
	vectors = [];
	##There are 2 blocks, the one before the sentence and the one after

	for block in blocks:
		block_lda = model[corpus.dictionary.doc2bow(corpus.proc(block))];
		topics = np.asarray(block_lda);
		totalWeight = topics.sum(axis=0)[1]
		print(topics);
		print('---')
		#Generate words, a dictionary that represents a vector that is the normalized combination of all the topics
		words = {};

		for row in topics:
			weight = row[1]/totalWeight;
			topicID = int(row[0]);
			term_list = model.get_topic_terms(topicID,T)
			for word_weight in term_list:
				word_id = word_weight[0];
				word_n = word_weight[1];
				if words.has_key(word_id):
					words[word_id] = words[word_id] + word_n * weight;
				else:
					words[word_id] = word_n * weight;
		aggregate_vector = words.items();
		vectors.append(aggregate_vector);

	dot_product = matutils.cossim(vectors[0],vectors[1])
	return dot_product
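Note: the snippet above is Python 2 (print statements, dict.has_key) and refers to names defined elsewhere in its module (s, T). A rough Python 3 sketch of the same idea, assuming model is a gensim LdaModel, dictionary a gensim Dictionary, proc a tokenizer, and top_terms the number of words kept per topic:

from gensim import matutils

def block_similarity(model, dictionary, proc, blocks, top_terms=20):
    """Blend the topic-word distributions of each block, then compare with cossim."""
    vectors = []
    for block in blocks:
        block_lda = model[dictionary.doc2bow(proc(block))]       # [(topic_id, prob), ...]
        total_weight = sum(prob for _, prob in block_lda)
        words = {}
        for topic_id, prob in block_lda:
            weight = prob / total_weight
            for word_id, word_p in model.get_topic_terms(topic_id, top_terms):
                words[word_id] = words.get(word_id, 0.0) + word_p * weight
        vectors.append(list(words.items()))
    return matutils.cossim(vectors[0], vectors[1])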
Example #12
    def sim_matrix(self, _topics2cousines):
        """
        Return two similarities matrix
        @params:
            _topics2cousines    - Required  : list of topics vectors for list of cuisines (list of list of floats)
        """

        _cuisine_matrix_e = []
        _cuisine_matrix_c = []

        for i, doc_a in enumerate(_topics2cousines):

            doc_a = doc_a[1]

            sim_vecs_e = []
            sim_vecs_c = []

            for j, doc_b in enumerate(_topics2cousines):
                doc_b = doc_b[1]

                w_sum_cs = matutils.cossim(doc_a, doc_b)
                w_sum_ed = 1 - self.euclidean_distance(list(doc_a), list(doc_b))

                if w_sum_ed < 0:
                    w_sum_ed = -1 * w_sum_ed

                sim_vecs_e.append(w_sum_ed)
                sim_vecs_c.append(w_sum_cs)

            _cuisine_matrix_e.append([_topics2cousines[i][0], sim_vecs_e])
            _cuisine_matrix_c.append([_topics2cousines[i][0], sim_vecs_c])

        return _cuisine_matrix_e, _cuisine_matrix_c
Example #13
    def test_lee(self):
        """correlation with human data > 0.6
        (this is the value which was achieved in the original paper)
        """

        global bg_corpus, corpus

        # create a dictionary and corpus (bag of words)
        dictionary = corpora.Dictionary(bg_corpus)
        bg_corpus = [dictionary.doc2bow(text) for text in bg_corpus]
        corpus = [dictionary.doc2bow(text) for text in corpus]

        # transform the bag of words with log_entropy normalization
        log_ent = models.LogEntropyModel(bg_corpus)
        bg_corpus_ent = log_ent[bg_corpus]

        # initialize an LSI transformation from background corpus
        lsi = models.LsiModel(bg_corpus_ent, id2word=dictionary, num_topics=200)
        # transform small corpus to lsi bow->log_ent->fold-in-lsi
        corpus_lsi = lsi[log_ent[corpus]]

        # compute pairwise similarity matrix and extract upper triangular
        res = np.zeros((len(corpus), len(corpus)))
        for i, par1 in enumerate(corpus_lsi):
            for j, par2 in enumerate(corpus_lsi):
                res[i, j] = matutils.cossim(par1, par2)
        flat = res[matutils.triu_indices(len(corpus), 1)]

        cor = np.corrcoef(flat, human_sim_vector)[0, 1]
        logging.info("LSI correlation coefficient is %s" % cor)
        self.assertTrue(cor > 0.6)
Example #14
def cosine_similarity(topics, segmented_topics, per_topic_postings, measure, gamma, num_docs):
    """
    This function calculates the indirect cosine measure. Given context vectors
    u = V(W') and w = V(W*) for the word sets of a pair S_i = (W', W*), the indirect
    cosine measure is computed as the cosine similarity between u and w.

    Args:
    ----
    topics : Topics obtained from the trained topic model.
    segmented_topics : Output from the segmentation module of the segmented topics. Is a list of list of tuples.
    per_topic_postings : Output from the probability_estimation module. Is a dictionary of the posting list of all topics.
    measure : String. Direct confirmation measure to be used. Supported values are "nlr" (normalized log ratio).
    gamma : Gamma value for computing W', W* vectors.
    num_docs : Total number of documents in corresponding corpus.
    """
    if measure == 'nlr':
        measure = direct_confirmation_measure.normalized_log_ratio_measure
    else:
        raise ValueError("The direct confirmation measure you entered is not currently supported.")
    backtrack = {}
    s_cos_sim = []
    for top_words, s_i in zip(topics, segmented_topics):
        for w_prime, w_star in s_i:
            w_prime_context_vectors, backtrack_i = _make_seg(w_prime, top_words, per_topic_postings, measure, gamma, backtrack, num_docs)
            backtrack.update(backtrack_i)
            w_star_context_vectors, backtrack_i = _make_seg(w_star, top_words, per_topic_postings, measure, gamma, backtrack, num_docs)
            backtrack.update(backtrack_i)
            s_cos_sim_i = cossim(w_prime_context_vectors.items(), w_star_context_vectors.items())
            s_cos_sim.append(s_cos_sim_i)

    return s_cos_sim
Example #15
    def order_by_tf_id_rank(self, headline, sentences, number_of_sentences):
        headline_bow = self.vocab.doc2bow(sent2stokens_wostop(headline))
        headline_tfidf = self.tfidf_model[headline_bow]

        scored_sentences = []
        'Replace newlines with blank, since the punkt tokenizer does not recognize .[newline]'
        #sentences = sentences.replace('\n', ' ')

        for sentence in self.tokenizer.tokenize(sentences):
            sentence_tfidf = self.vocab.doc2bow(sent2stokens_wostop(sentence))
            sim = cossim(headline_tfidf, sentence_tfidf)
            #print(str(sim))
            scored_sentences.append([sentence, sim])

        sorted_sentences = sorted(
            scored_sentences,
            key=lambda scored_sentences: scored_sentences[1],
            reverse=True)
        '''
        for sentence in sorted_sentences:
        print(str(sentence))
        '''
        ' return sorted_sentences '

        sentences_string = ""
        current_sentence_number = 0
        for sentence in sorted_sentences:
            current_sentence_number += 1
            sentences_string += sentence[0] + ' '
            if current_sentence_number == number_of_sentences:
                break
        #print("Ranked: \n " + sentences_string)
        return sentences_string
Example #16
def calculateCoherence(config):

    lsaModel = models.LsiModel.load(config['LSA']['modelFileLocation'])

    dictionary = corpora.Dictionary.load(
        config['corpus']['corpusFolderLocation'] + 'corpus.dict')

    corpus = corpora.MmCorpus(config['corpus']['corpusFolderLocation'] +
                              'corpus.mm')

    tfidf = models.TfidfModel(corpus)

    for dataset in config['corpus']['datasets']:

        for transcriptPath in os.listdir(dataset['path']):
            document = loadFileIntoList(dataset['path'] + "/" + transcriptPath,
                                        dataset['transcriptColumn'],
                                        dataset['delimiter'],
                                        dataset['rowsToSkip'],
                                        config['corpus']['sentenceSplitting'],
                                        config['corpus']['stopwords'])

            with open(
                    config['coherence']['outputFolderLocation'] +
                    transcriptPath + "_results_lsa.tsv", 'w') as outputFile:

                # write header
                outputFile.write(
                    "coherence to previous sentence(s)\tpreprocessed sentence\tfull sentence\tcorresponding turn\n"
                )

                lastSentencesLSAList = []
                for sentence in document:
                    if " ".join(sentence[0]):

                        sentenceBow = dictionary.doc2bow(sentence[0])

                        weightIndex = 1
                        simLSA = 0
                        weightNormalizer = 0
                        for el in lastSentencesLSAList:
                            simLSA += 1 / weightIndex * matutils.cossim(
                                lsaModel[tfidf[sentenceBow]], el)
                            weightNormalizer += 1 / weightIndex
                            weightIndex += 1

                        if weightNormalizer > 0:
                            simLSA /= weightNormalizer

                        lastSentencesLSAList.insert(
                            0, lsaModel[tfidf[sentenceBow]])

                        if len(lastSentencesLSAList
                               ) > config['coherence']['slidingWindow']:
                            del lastSentencesLSAList[-1]

                        outputFile.write(str(simLSA) + "\t")
                        outputFile.write(" ".join(sentence[0]) + "\t")
                        outputFile.write(" ".join(sentence[1]) + "\t")
                        outputFile.write(sentence[2] + "\n")
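Note: the inner loop above weights the similarity to each previous sentence by 1/position (most recent first) and normalizes by the sum of those weights. Extracted as a standalone helper, assuming the vectors are already in gensim's sparse (id, weight) format:

from gensim import matutils

def windowed_coherence(current_vec, previous_vecs):
    """Weighted mean cosine similarity; previous_vecs[0] is the most recent sentence."""
    sim, norm = 0.0, 0.0
    for i, prev in enumerate(previous_vecs, start=1):
        sim += matutils.cossim(current_vec, prev) / i
        norm += 1.0 / i
    return sim / norm if norm > 0 else 0.0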
Example #17
def calc_similarity_score(tokens1, tokens2):
    '''
    Calculate a similarity score comparing tokens1 and tokens2 using cosine similarity.
    '''
    corpus1 = get_corpus(tokens1)
    corpus2 = get_corpus(tokens2)
    return matutils.cossim(corpus1, corpus2)
Example #18
    def _update_pairwise_similarity(self, topic_id, content, date):
        """
        updates similarity data within the corpus
        """
        bow = self.dictionary.doc2bow(content)
        for tid, data in self.data.items():
            day_delta = (int(date) - int(data['date'])) / NUM_SECONDS_PER_DAY
            time_factor = math.pow(self.time_decay, day_delta)
            if tid == topic_id:
                continue
            bow1 = self.dictionary.doc2bow(data['body'])
            sim = matutils.cossim(bow, bow1)
            sim_1 = sim * min(1.0, 1 / time_factor)
            sim_2 = sim * min(1.0, time_factor)

            if self.irrelevant_thresh <= sim_1 <= self.duplicate_thresh:
                del_id = insert(data['sim_list'], topic_id, sim_1,
                                self.max_recoms)
                if del_id is not None:
                    self.data[topic_id]['appears_in'].append(tid)
                    self.data[tid]['updated'] = True
                    if del_id != '':
                        remove(self.data[del_id]['appears_in'], tid)

            if self.irrelevant_thresh <= sim_2 <= self.duplicate_thresh:
                del_id = insert(self.data[topic_id]['sim_list'], tid, sim_2,
                                self.max_recoms)
                if del_id is not None:
                    self.data[tid]['appears_in'].append(topic_id)
                    if del_id != '':
                        remove(self.data[del_id]['appears_in'], topic_id)
Example #19
def calculate_similarity(vec_sentence1, vec_sentence2, network_type):
    embeddings = ['d2v', 'gd2v', 'fastT', 'gloVe', 's2v']
    if network_type == 'tfidf':
        return matutils.cossim(vec_sentence1, vec_sentence2)
    #if network_type=='d2v' or network_type=='gd2v':
    if network_type in embeddings:
        return 1 - spatial.distance.cosine(vec_sentence1, vec_sentence2)
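Note: matutils.cossim expects sparse (id, weight) lists, while scipy.spatial.distance.cosine expects dense arrays; once the representations are converted, the two agree. A quick check:

from gensim import matutils
from scipy import spatial

sparse1 = [(0, 1.0), (2, 3.0)]
sparse2 = [(0, 2.0), (1, 1.0)]

dense1 = matutils.sparse2full(sparse1, length=3)   # [1., 0., 3.]
dense2 = matutils.sparse2full(sparse2, length=3)   # [2., 1., 0.]

print(matutils.cossim(sparse1, sparse2))            # ~0.283
print(1 - spatial.distance.cosine(dense1, dense2))  # same value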
Example #20
    def get_similarity_for_article_and_cluster(self, article, cluster):
        article.pre_process(self.postagger, self.stop_words, self.n_keywords)
        word_tfidfs = article.title_content_effective_word_tfidfs
        similarity = np.mean(
            [matutils.cossim(article.title_content_effective_word_tfidfs, word_tfidfs) for article in cluster.articles])

        return similarity
Example #21
    def mapper(self, _, line):
        read_line = list(next(csv.reader([line], delimiter="\t")))
        user = read_line[0]
        sim_user = read_line[1]

        user_df = self.df[self.df["user"] == user]
        user_rest = list(user_df["rest"].values)
        sim_user_df = self.df[(self.df["user"] == sim_user)
                              & (~self.df["rest"].isin(user_rest))]

        for i in range(len(user_df)):
            rest = user_df.iloc[i]["rest"]
            la = float(user_df.iloc[i]["la"])
            lon = float(user_df.iloc[i]["lon"])
            # strip unneeded symbols
            vec = ast.literal_eval(user_df.iloc[i]["vec"])
            lsi1 = self.lsi[vec]
            for j in range(len(sim_user_df)):
                sim_rest = sim_user_df.iloc[j]["rest"]
                sim_la = float(sim_user_df.iloc[j]["la"])
                sim_lon = float(sim_user_df.iloc[j]["lon"])
                sim_vec = ast.literal_eval(sim_user_df.iloc[j]["vec"])
                lsi2 = self.lsi[sim_vec]
                sim_score = cossim(lsi1, lsi2)
                dist = haversine_distance((la, lon), (sim_la, sim_lon))
                yield None, (user, sim_user, rest, sim_rest, sim_score, dist)
Example #22
def calculate_lsi_requested_sim_in_df(model, dictionary, doc1ID, doc2ID,
                                      doc_data, IDfield, contentField,
                                      stoplist):
    """
    Expects a dataframe that actually holds the text associated with each doc (which
    is just passed in as strings that denote the id)
    Built in some flexibility for different field naming conventions by making
    the IDfield and contentField variables
    """
    # sim_output = []
    # for pair in pairs:
    #     low_id = pair[0]
    #     high_id = pair[1]
    #     idea1 = lemmatize_an_idea(all_ideas[all_ideas.ideaID == low_id].idea.values[0])
    #     idea2 = lemmatize_an_idea(all_ideas[all_ideas.ideaID == high_id].idea.values[0])
    #     vec1 = model[dictionary.doc2bow(idea1)]
    #     vec2 = model[dictionary.doc2bow(idea2)]
    #     sim_output.append((low_id, high_id,cossim(vec1,vec2)))

    text1 = lemmatize_an_idea(
        doc_data[doc_data[IDfield] == doc1ID][contentField].values[0],
        stoplist)
    text2 = lemmatize_an_idea(
        doc_data[doc_data[IDfield] == doc2ID][contentField].values[0],
        stoplist)
    vec1 = model[dictionary.doc2bow(text1)]
    vec2 = model[dictionary.doc2bow(text2)]

    return cossim(vec1, vec2)
Example #23
def cossim_pairs(topic_models, num_topics=20):
    topic_cos_map = {}

    num_topics = 20
    for i, m in enumerate(topic_models):
        cur_date = temp_dates[i]
        month = int(cur_date.split("-")[1])
        
        if month < 9:
            break
        if month != 9:
            continue
        
        for u in range(num_topics):
            for j, n in enumerate(topic_models):
                if i == j:
                    continue
                top_cs = -1
                top_topic = ""
                for v in range(num_topics):
                    cs = cossim(m.show_topic(u), n.show_topic(v))
                    if cs > top_cs:
                        top_cs = cs
                        top_topic = "{}:{}_{}:{}".format(i, u, j, v)

                topic_cos_map[top_topic] = top_cs
    return topic_cos_map
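Note: show_topic returns (word, probability) pairs rather than integer ids. cossim only needs the keys of its two inputs to match, so topics from independently trained models can be compared as long as they share vocabulary strings (at least in the gensim versions these snippets target). With literal pairs:

from gensim import matutils

topic_a = [("market", 0.12), ("stocks", 0.10), ("trade", 0.05)]
topic_b = [("market", 0.09), ("trade", 0.07), ("policy", 0.06)]
print(matutils.cossim(topic_a, topic_b))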
Example #24
 def tfidf_sim(self, train_data, body_dict, threshold):
     '''
     :param train_data: a list of training samples of type ['headline', 'bodyID', 'stance']
     :param body_dict: a dictionary of values containing {bodyID: 'bodyText'}
     :param threshold: used to distinguish between similar and not similar
     '''
     bodyText_list = list(body_dict.values())
     bodyIds_index = {k:index for index, k in enumerate(body_dict.keys())}
     
     bodyText_w = [sent2stokens_wostop(text) for text in bodyText_list]
     
     vocab = corpora.Dictionary(bodyText_w)
     corporaBody_bow = [vocab.doc2bow(text) for text in bodyText_w]
     tfidf_model = models.TfidfModel(corporaBody_bow)
     
     unrelated, related, y_true, y_pred = [], [], [], []
     for headline, bodyID, stance in train_data:        
         headline_bow = vocab.doc2bow(sent2stokens_wostop(headline))
         
         headlines_tfidf = tfidf_model[headline_bow]
         corporaBody_tfidf = tfidf_model[corporaBody_bow[bodyIds_index[bodyID]]]
         
         sim = cossim(headlines_tfidf, corporaBody_tfidf)
         unrelated, related, y_true, y_pred = create_lists(sim, stance, threshold, [unrelated, related, y_true, y_pred])
     
     print_results([unrelated, related, y_true, y_pred], self.model_type)      
Example #25
    def predict_score(self, user_id, item_id):

        ratings = Rating.objects.filter(user_id=user_id)
        rated_movies = {r['movie_id']: r['rating'] for r in ratings.values()}

        md = MovieDescriptions.objects.filter(imdb_id=item_id).first()
        rated_movies_desc = MovieDescriptions.objects.filter(
            imdb_id__in=rated_movies.keys())

        if md is None:
            return 0

        if rated_movies_desc is None:
            return 0
        if rated_movies_desc.count() == 0:
            return 0

        top = 0.0
        bottom = 0.0

        for rm in rated_movies_desc:
            lda_vector = self.corpus[int(md.lda_vector)]
            lda_vector_sim = self.corpus[int(rm.lda_vector)]
            sim = matutils.cossim(lda_vector, lda_vector_sim)
            rating = rated_movies[rm.imdb_id]

            top += sim * float(rating)
            bottom += sim

        return top / bottom
Example #26
 def assign_article_topics(self, article_id, heading, process_all = False):
     """ Assign the appropriate topics to the given article in the database """
     if self._dictionary is None:
         self.load_dictionary()
     if self._tfidf is None:
         self.load_tfidf_model()
     if self._model is None:
         self.load_lsi_model()
     if self._topics is None:
         self.load_topics()
     with SessionContext(commit = True) as session:
         q = session.query(Word.stem, Word.cat, Word.cnt) \
             .filter(Word.article_id == article_id).all()
         wlist = []
         for stem, cat, cnt in q:
             # Convert stem to lowercase and replace spaces with underscores
             w = w_from_stem(stem, cat)
             if cnt == 1:
                 wlist.append(w)
             else:
                 wlist.extend([w] * cnt)
         topics = []
         article_vector = []
         if self._topics and wlist:
             bag = self._dictionary.doc2bow(wlist)
             tfidf = self._tfidf[bag]
             article_vector = self._model[tfidf]
             topic_names = []
             if self._verbose:
                 print("{0} : {1}".format(article_id, heading))
             for topic_id, topic_info in self._topics.items():
                 topic_name = topic_info["name"]
                 topic_vector = topic_info["vector"]
                 topic_threshold = topic_info["threshold"]
                 # Calculate the cosine similarity between the article and the topic
                 similarity = matutils.cossim(article_vector, topic_vector)
                 if self._verbose:
                     print("   Similarity to topic {0} is {1:.3f}".format(topic_name, similarity))
                 if similarity >= topic_threshold:
                     # Similar enough: this is a topic of the article
                     topics.append(topic_id)
                     topic_names.append((topic_name, similarity))
             if topic_names and not process_all:
                 print("Article '{0}':\n   topics {1}".format(heading, topic_names))
         # Topics found (if any): delete previous ones (if any)
         session.execute(ArticleTopic.table().delete().where(ArticleTopic.article_id == article_id))
         # ...and add the new ones
         for topic_id in topics:
             session.add(ArticleTopic(article_id = article_id, topic_id = topic_id))
         # Update the indexed timestamp and the article topic vector
         a = session.query(Article).filter(Article.id == article_id).one_or_none()
         if a is not None:
             a.indexed = datetime.utcnow()
             if article_vector:
                 # Store a pure list of floats
                 topic_vector = [ t[1] for t in article_vector ]
                 a.topic_vector = json.dumps(topic_vector)
             else:
                 a.topic_vector = None
Example #27
def getComparable(source_lsi_doc, target_lsi_corpus):
	sims = []
	for i in range(len(target_lsi_corpus)):
		sims.append( matutils.cossim(source_lsi_doc, target_lsi_corpus[i]) )
	sortedSims = sorted(enumerate(sims), key=lambda item: -item[1])
	topIndex = sortedSims[0][0]
	topSim = sortedSims[0][1]
	return sortedSims[0]
Example #28
def title_similarity(t1, t2):
    if not t1 or not t2:
        return -2
    else:
        return cossim(
            tokens_to_lsi(t1),
            tokens_to_lsi(t2)
        )
Example #29
def description_similarity(d1, d2):
    if not d1 or not d2:
        return -2
    else:
        return cossim(
            tokens_to_lsi(d1),
            tokens_to_lsi(d2)
        )
Example #30
def getComparable(source_lsi_doc, target_lsi_corpus):
    sims = []
    for i in range(len(target_lsi_corpus)):
        sims.append(matutils.cossim(source_lsi_doc, target_lsi_corpus[i]))
    sortedSims = sorted(enumerate(sims), key=lambda item: -item[1])
    topIndex = sortedSims[0][0]
    topSim = sortedSims[0][1]
    return sortedSims[0]
Example #31
    def compute_similarity(self, doc1, doc2):
        """Compute the cosine similarity between two documents.

        :doc1: a list of strings, representing the first document.
        :doc2: a list of strings, representing the second document.
        :returns: a number between -1 and 1, representing the similarity
        between the two documents.
        """
        return cossim(self.get_vector(doc1), self.get_vector(doc2))
Example #32
    def compute_similarity(self, doc1, doc2):
        """Compute the cosine similarity between two documents.

        :doc1: a list of strings, representing the first document.
        :doc2: a list of strings, representing the second document.
        :returns: a number between -1 and 1, representing the similarity
        between the two documents.
        """
        return cossim(self.get_vector(doc1), self.get_vector(doc2))
Example #33
 def assign_article_topics(self, article_id, heading):
     """ Assign the appropriate topics to the given article in the database """
     if self._dictionary is None:
         self.load_dictionary()
     if self._tfidf is None:
         self.load_tfidf_model()
     if self._model is None:
         self.load_lda_model()
     if self._topics is None:
         self.load_topics()
     with SessionContext(commit=True) as session:
         q = session.query(Word.stem, Word.cat, Word.cnt) \
             .filter(Word.article_id == article_id).all()
         wlist = []
         for stem, cat, cnt in q:
             # Convert stem to lowercase and replace spaces with underscores
             w = stem.lower().replace(" ", "_") + "/" + cat
             if cnt == 1:
                 wlist.append(w)
             else:
                 wlist.extend([w] * cnt)
         topics = []
         if self._topics and wlist:
             bag = self._dictionary.doc2bow(wlist)
             tfidf = self._tfidf[bag]
             article_vector = self._model[tfidf]
             topic_names = []
             if self._verbose:
                 print("{0} : {1}".format(article_id, heading))
             for topic_id, topic_info in self._topics.items():
                 topic_name = topic_info["name"]
                 topic_vector = topic_info["vector"]
                 topic_threshold = topic_info["threshold"]
                 # Calculate the cosine similarity between the article and the topic
                 similarity = matutils.cossim(article_vector, topic_vector)
                 if self._verbose:
                     print("   Similarity to topic {0} is {1:.3f}".format(
                         topic_name, similarity))
                 if similarity >= topic_threshold:
                     # Similar enough: this is a topic of the article
                     topics.append(topic_id)
                     topic_names.append((topic_name, similarity))
             if topic_names:
                 print("Article '{0}': topics {1}".format(
                     heading, topic_names))
         # Topics found (if any): delete previous ones (if any)
         session.execute(ArticleTopic.table().delete().where(
             ArticleTopic.article_id == article_id))
         # ...and add the new ones
         for topic_id in topics:
             session.add(
                 ArticleTopic(article_id=article_id, topic_id=topic_id))
         # Update the indexed timestamp
         a = session.query(Article).filter(
             Article.id == article_id).one_or_none()
         if a:
             a.indexed = datetime.utcnow()
Example #34
def getMaxSimilarity(dictTopic, vector):
    maxValue = 0
    maxIndex = -1
    for k, cluster in dictTopic.items():
        oneSimilarity = np.mean([matutils.cossim(vector, v) for v in cluster])
        if oneSimilarity > maxValue:
            maxValue = oneSimilarity
            maxIndex = k
    return maxIndex, maxValue
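Note: getMaxSimilarity is the matching step of a single-pass clustering loop: assign the document to the closest existing cluster if its mean similarity clears a threshold, otherwise start a new cluster. An outline of that surrounding loop (threshold and input vectors are hypothetical):

import numpy as np
from gensim import matutils

def single_pass_cluster(vectors, threshold=0.3):
    clusters = {}                                  # cluster id -> list of sparse vectors
    for vec in vectors:
        best_id, best_sim = -1, 0.0
        for k, members in clusters.items():
            sim = np.mean([matutils.cossim(vec, v) for v in members])
            if sim > best_sim:
                best_id, best_sim = k, sim
        if best_sim > threshold:
            clusters[best_id].append(vec)
        else:
            clusters[len(clusters)] = [vec]
    return clusters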
Example #35
def wordsim(left, right):
    leftvec = unidict.doc2bow(left.lower().split())
    rightvec = unidict.doc2bow(right.lower().split())
    leftlsi = unilsi[leftvec]
    rightlsi = unilsi[rightvec]
    #leftlda = unilda[leftvec] # matutils.sparse2full(..., 300)
    #rightlda = unilda[rightvec]

    return {"lsi": matutils.cossim(leftlsi, rightlsi),}
Example #36
def vsm_dist(song_A, song_B):
	# try:
	tif = models.TfidfModel(a_corps)
	a_tif = tif[song_A['tokenized_comments']]
	b_tif = tif[song_B['tokenized_comments']]
	dist =  matutils.cossim(a_tif, b_tif)
	if dist == 0: 
		dist = 0.0000001#avoid the div by 0
	return dist
Example #37
    def get_max_similarity(self, article):
        title_content_word_tfidfs = article.title_content_effective_word_tfidfs
        title_content_max_sim = 0
        title_content_max_sim_cluster_id = -1

        content_word_tfidfs = article.content_effective_word_tfidfs
        content_max_sim = 0
        content_max_sim_cluster_id = -1

        title_word_tfidfs = article.title_effective_word_tfidfs
        title_max_sim = 0
        title_max_sim_cluster_id = -1

        for i in np.arange(len(self.clusters)):
            cluster = self.clusters[i]
            # title_content
            title_content_similarity = np.mean([
                matutils.cossim(article.title_content_effective_word_tfidfs,
                                title_content_word_tfidfs)
                for article in cluster.articles
            ])
            if title_content_similarity > title_content_max_sim:
                title_content_max_sim = title_content_similarity
                title_content_max_sim_cluster_id = i

            content_similarity = np.mean([
                matutils.cossim(article.content_effective_word_tfidfs,
                                content_word_tfidfs)
                for article in cluster.articles
            ])
            if content_similarity > content_max_sim:
                content_max_sim = content_similarity
                content_max_sim_cluster_id = i

            title_similarity = np.mean([
                matutils.cossim(article.title_effective_word_tfidfs,
                                title_word_tfidfs)
                for article in cluster.articles
            ])
            if title_similarity > title_max_sim:
                title_max_sim = title_similarity
                title_max_sim_cluster_id = i

        return title_content_max_sim, title_content_max_sim_cluster_id, content_max_sim, content_max_sim_cluster_id, title_max_sim, title_max_sim_cluster_id
Example #38
 def _get_doc_similarity(self, doc1_tk, doc2_tk):
     """
     :param doc1_tk: Preprocessed documents as tokens
     :param doc2_tk: Preprocessed documents as tokens
     :return:
     """
     dis1 = self.get_topic_distrb(doc1_tk)
     dis2 = self.get_topic_distrb(doc2_tk)
     # return 1 - matutils.hellinger(dis1, dis2)
     return matutils.cossim(dis1, dis2)
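Note: the commented-out line points at an alternative metric: gensim also provides matutils.hellinger, which like cossim accepts bag-of-words style vectors. A quick comparison on two made-up topic distributions:

from gensim import matutils

dis1 = [(0, 0.7), (1, 0.2), (2, 0.1)]
dis2 = [(0, 0.6), (1, 0.3), (2, 0.1)]

print(matutils.cossim(dis1, dis2))           # cosine similarity
print(1 - matutils.hellinger(dis1, dis2))    # the commented-out alternative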
Example #39
    def similarity_lsi(self, text1, text2):
        # convert text into bag of words model
        text1_bow = self.__dictionary.doc2bow(self.__preprocess_text_document(text1))
        text2_bow = self.__dictionary.doc2bow(self.__preprocess_text_document(text2))

        # transform text into the model's domain
        text1_model = self.__model_lsi[text1_bow]
        text2_model = self.__model_lsi[text2_bow]

        return cossim(text1_model, text2_model)
Example #40
File: rvw_sim.py  Project: Shuting05/WACC
 def reducer(self, key, value):
     lst_of_rvws = list(value)[0]
     rvws1 = lst_of_rvws[0]
     rvws2 = lst_of_rvws[1]
     # print(rvws1)
     sim = cossim(self.dct.doc2bow(rvws1), self.dct.doc2bow(rvws2))
     # similarity.append((biz[i], biz[j], sim))
     # print('reducer')
     join_key = str(key[0]) + '\t' + str(key[1])
     yield join_key, str(sim)
Example #41
def calculate_lsi_requested_sim(model, dictionary, text1, text2, stoplist):
    """
    Base version that just takes in two strings and spits out a similarity in the provided gensim model space
    """

    text1_lemm = lemmatize_an_idea(text1, stoplist)
    text2_lemm = lemmatize_an_idea(text2, stoplist)
    vec1 = model[dictionary.doc2bow(text1_lemm)]
    vec2 = model[dictionary.doc2bow(text2_lemm)]

    return cossim(vec1, vec2)
Example #42
def get_abstract_similarity(row):
    if pd.notnull(row['abstract_lr']) and pd.notnull(row['abstract_cp']):
        lr_doc = preprocess_doc(row['abstract_lr'])
        cp_doc = preprocess_doc(row['abstract_cp'])
        lr_bow = abstract_dict.doc2bow(lr_doc)
        cp_bow = abstract_dict.doc2bow(cp_doc)
        lr_lsi = abstract_lsi[lr_bow]
        cp_lsi = abstract_lsi[cp_bow]
        return matutils.cossim(lr_lsi, cp_lsi)
    else:
        return 0
Example #43
def calculate_best_score(candidate_id, reference_ids):
    # translate candidate to LSI vector
    vec_lsi = id2lsi_vec(candidate_id)

    # determine similarities for each of the references
    scores = [
        matutils.cossim(vec_lsi, id2lsi_vec(reference_id))
        for reference_id in reference_ids
    ]

    return max(scores)
Example #44
 def getMaxSimilarity(self, dictTopic, vector):
     # 计算新进入文档和已有文档的文本相似度,这里的相似度采用的是cosine余弦相似度
     maxValue = 0
     maxIndex = -1
     for k, cluster in dictTopic.items():
         oneSimilarity = np.mean(
             [matutils.cossim(vector, v) for v in cluster])
         if oneSimilarity > maxValue:
             maxValue = oneSimilarity
             maxIndex = k
     return maxIndex, maxValue
Example #45
    def compare_strings(self, s1, s2):
        # Build vector for doc1 
        vec_bow1 = self.dictionary.doc2bow(s1.lower().split())
        vec_lsi1 = self.lsi[vec_bow1] # convert the query to LSI space
        
        # Build vector for doc2
        vec_bow2 = self.dictionary.doc2bow(s2.lower().split())
        vec_lsi2 = self.lsi[vec_bow2]

        # Calculate cosine similarity
        sim = matutils.cossim(vec_lsi1, vec_lsi2)
        return sim
Example #46
 def get_link_scores(self, source, target):
     """
     :param source: source artifact; its 'tokens' field holds space-separated tokens
     :param target: target artifact; its 'tokens' field holds space-separated tokens
     :return:
     """
     doc1_tk = source['tokens'].split()
     doc2_tk = target['tokens'].split()
     dis1 = self.get_topic_distrb(doc1_tk)
     dis2 = self.get_topic_distrb(doc2_tk)
     # return 1 - matutils.hellinger(dis1, dis2)
     return matutils.cossim(dis1, dis2)
Example #47
def loadDocuments():
	documents = []

	for info in LIST_OF_TRANSCRIPTS_INFO:		
		for transcriptPath in os.listdir(info[0]):
			document = loadFileIntoList(info[0] + "/" + transcriptPath, info[1], info[2], info[3])
			documents.append(document)
	
	for doc in documents:
		last_sentence_lsi = []
		last_sentence_lda = []
		for sentence in doc:
			sentence_bow = dictionary.doc2bow(sentence.split(" "))
			print(sentence)
			#print(lsi[tfidf[sentence_bow]])
			#print(lda[sentence_bow])
			sim_lsi = matutils.cossim(lsi[tfidf[sentence_bow]], last_sentence_lsi)
			sim_lda = matutils.cossim(lda[sentence_bow], last_sentence_lda)
			print(sim_lsi)
			print(sim_lda)
			last_sentence_lsi = lsi[tfidf[sentence_bow]]
			last_sentence_lda = lda[sentence_bow]
Example #48
def ComputerSimilarityOld(model, corpus, blocks):
	#print blocks;
	#print(s);
	vectors = [];
	##There are 2 blocks, the one before the sentence and the one after

	for block in blocks:
		#print(block);
		words = corpus.proc(block)
		topic_distribution = {};
		words_culled = [];
		for word in words:
			block_lda = model[corpus.dictionary.doc2bow(corpus.proc(word))];
			topic_id = 0;
			prob = 0;
			for t in block_lda:
				if t[1] > prob:
					topic_id = t[0];
					prob = t[1]

			if (prob < .1):
				continue;

			ppp = 0;
			word_id = corpus.dictionary.token2id[word];
			if WordProbThreshold:
				terms = model.get_topic_terms(topic_id,100000);
				for term in terms:
					if term[0] == word_id:
						ppp = term[1];

			if (ppp < X and WordProbThreshold):
				print("CUT  " + word + "    " + str(ppp));
				continue;

			print(word + "    " + str(ppp));
			words_culled.append(word)
			if topic_distribution.has_key(topic_id):
				topic_distribution[topic_id] = topic_distribution[topic_id]+1;
			else:
				topic_distribution[topic_id] = 1;
		print("-------------");
			#print((topic_id, prob));

		#print(model[corpus.dictionary.doc2bow(words_culled)])
		#print(topic_distribution)
		#print("bleh")
		vectors.append(topic_distribution.items());

	dot_product = matutils.cossim(vectors[0],vectors[1])
	return dot_product;
Example #49
    def score(query, profile, data=None):
        if not len(profile.description):
            return [-1]

        vectorspace = VectorSpace([])

        tokenized_query = LowerTokenizer.tokenize(query)
        tokenized_description = LowerTokenizer.tokenize(profile.description)

        query_vector = vectorspace.vector_for_document(
            tokenized_document=tokenized_query,
            update=True)

        description_vector = vectorspace.vector_for_document(
            tokenized_document=tokenized_description,
            update=True)

        return [cossim(description_vector, query_vector)]
Example #50
    def test_get_similarities(self, mock_q_simmx_file_path, mock_a_simmx_file_path, mock_dict_file_path, mock_md_file_path):
        mock_md_file_path.return_value = self.test_md_file_path
        mock_dict_file_path.return_value = self.test_dict_file_path
        mock_q_simmx_file_path.return_value = self.test_q_simmx_file_path
        mock_a_simmx_file_path.return_value = self.test_a_simmx_file_path

        model_struct = TfIdfModelStruct.get_model(data_store=self.data_store)

        query_doc = "Is brocolli tasty to eat?"
        compare_docs = self.data_store.doc_set

        sims = model_struct.get_similarities(query_doc, compare_docs)

        for idx, sim in enumerate(sims):
            expected_sim = cossim(
                model_struct.get_tfidf_vec(query_doc),
                model_struct.get_tfidf_vec(compare_docs[idx])
            )
            self.assertAlmostEqual(sim[1], expected_sim)
Example #51
def get_query_entity_document_score(query_document_tfidf_vector,candidate_entity_id):
    try:
        entity_document_tfidf_vector = temp_entity_id_content_vec_dic[candidate_entity_id]
    except KeyError as ke:
        content_text = ""
        fr = open(entity_documents_file_path + candidate_entity_id, 'r')
        line = fr.readline().decode(coding)
        while(line):
            line = line.strip()
            if(line != ''):
                content_text += line + " "
            line = fr.readline().decode(coding)
        fr.close()
        entity_document_tfidf_vector = tfidf_model[tfidf_dictionary.doc2bow(content_text.strip().lower().split())]
        temp_entity_id_content_vec_dic[candidate_entity_id] = entity_document_tfidf_vector

    score = matutils.cossim(query_document_tfidf_vector,entity_document_tfidf_vector)

    return score
Example #52
def tfidf_distance(corpora, tfidf, tfidf_web, mean_vec, web_2, loss_weight):
    """compute the distance (as a function of cosine similarity) between two websites using tfidf model"""

    if len(mean_vec) == 0:
        return loss_weight

    try:
        indx_2 = tfidf_web.values().index(web_2)
    except ValueError:
        return loss_weight

    doc_num_2 = tfidf_web.keys()[indx_2]

    bow_2 = corpora[doc_num_2]

    tf_rap_2 = matutils.unitvec(tfidf[bow_2])       # get its tfidf representation

    cosine_sim = min(matutils.cossim(mean_vec, tf_rap_2), 1.0)

    return sqrt(2.0 * (1.0 - cosine_sim)) / 2.0               # return the distance of the two vectors
Example #53
    def value_for_text(self, t, rp=default_rp):
        space = rp.lsa_space()
        num_topics = space.num_topics

        tokens = rp.tokens(t)
        tokens = [[token.lower() for token in sentence] for sentence in tokens]

        if len(tokens) < 2:
            return 0

        spans = np.zeros(len(tokens) - 1)
        for i in range(1, len(tokens)):
            past_sentences = tokens[:i]
            span_dim = len(past_sentences)

            if span_dim > num_topics - 1:
                # It's not clear, from the papers I read, what should be done
                # in this case. I did what seemed least likely to lose
                # information.
                beginning = past_sentences[0:span_dim - num_topics]
                past_sentences[0] = list(chain.from_iterable(beginning))

            past_vectors = [sparse2full(space.get_vector(sent), num_topics)
                            for sent in past_sentences]

            curr_vector = sparse2full(space.get_vector(tokens[i]), num_topics)
            curr_array = np.array(curr_vector).reshape(num_topics, 1)

            A = np.array(past_vectors).transpose()

            projection_matrix = dot(dot(A,
                                        pinv(dot(A.transpose(),
                                                 A))),
                                    A.transpose())

            projection = dot(projection_matrix, curr_array).ravel()

            spans[i - 1] = cossim(full2sparse(curr_vector),
                                  full2sparse(projection))

        return self.get_value(spans)
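Note: Example #53 moves between dense numpy vectors (needed for the projection algebra) and gensim's sparse format (needed by cossim). The two converters round-trip cleanly, dropping only zero entries:

import numpy as np
from gensim.matutils import full2sparse, sparse2full, cossim

dense = np.array([0.5, 0.0, 0.25, 0.0])
sparse = full2sparse(dense)                  # [(0, 0.5), (2, 0.25)]
roundtrip = sparse2full(sparse, len(dense))  # back to [0.5, 0., 0.25, 0.]

print(cossim(full2sparse(dense), full2sparse(roundtrip)))   # 1.0 (up to float rounding)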
Example #54
def cosine_by_tfidf(list1, list2, feature):

    cosine_value = 0.0

    intersection_set = set(list1) & set(list2)

    if intersection_set:
        union_set = set(list1) | set(list2)

        feature_count = {}
        for item in union_set:
            feature_count[item] = reference_dict[feature][item]
            
            # feature_count[item] = collection.find({feature: {'$in': [item]}}).count()

        list1_dict = dict(Counter(list1))
        list2_dict = dict(Counter(list2))

        # pprint(list1_dict)
        # pprint(list2_dict)

        num1 = len(list1)
        num2 = len(list2)

        list1_tfidf_dict = {}
        for k, v in list1_dict.items():
            list1_tfidf_dict[k] = (v / num1) * math.log(movie_count / feature_count[k])

        list2_tfidf_dict = {}
        for k, v in list2_dict.items():
            list2_tfidf_dict[k] = (v / num2) * math.log(movie_count / feature_count[k])


        # pprint(list1_tfidf_dict)
        # pprint(list2_tfidf_dict)

        cosine_value = matutils.cossim(list1_tfidf_dict, list2_tfidf_dict)

    # print cosine_value
    return cosine_value
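Note: unlike most of the snippets here, this one passes plain dicts of {term: tf-idf weight} straight to cossim. That appears to work because cossim normalizes its inputs with dict(...), so any mapping from hashable ids to weights is accepted (true for the gensim versions these examples use). For instance:

from gensim import matutils

tfidf_a = {"comedy": 0.8, "romance": 0.3}
tfidf_b = {"comedy": 0.5, "thriller": 0.6}
print(matutils.cossim(tfidf_a, tfidf_b))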
Example #55
    def query(self, query, sorted=False):
        """Given a search query, returns a list of document IDs and their
        cosine similarity to that query.

        If sorted is True, results are returned sorted in descending order of
        cosine similarity. Otherwise, results are returned ordered by document
        ID."""
        tokenized = self.tokenizer.tokenize(query)
        query_vector = self.vectorspace.vector_for_document(tokenized,
                                                            update=True)

        results = []
        for index, document in enumerate(self.documents):
            tokenized = self.tokenizer.tokenize(document)
            document_vector = self.vectorspace.vector_for_document(tokenized)
            similarity = cossim(query_vector, document_vector)
            results.append((index, similarity))

        if sorted:
            # the parameter shadows the builtin sorted(), so sort in place instead
            results.sort(key=lambda item: -item[1])
            return results

        return results
Example #56
    def predict(self, X_test, acronym):
        labels = []
        confidences = []
        for test_lda in X_test:
            similarities = map(
                lambda article_lda: cossim(article_lda, test_lda), self.X_train_lda)
            similarityByLabel = sorted(
                zip(similarities, self.y_train_labels), key=lambda item: item[0], reverse=True)

            chosen_label = similarityByLabel[0][1]
            max_similarity = similarityByLabel[0][0]

            similarityByLabelForNotChosen = [
                item for item in similarityByLabel if item[1] != chosen_label]

            confidence = min_confidence
            if(similarityByLabelForNotChosen):
                confidence = max_similarity - \
                    similarityByLabelForNotChosen[0][0]

            labels.append(chosen_label)
            confidences.append(confidence)
        
        return labels, confidences
Example #57
    def cossineSim(self, N):
        print "Cossine sim"
        simMatrixCos = []

        for x in range(0,50):
            topicMatrixCos = []
            for y in range(0,50):
                model = self.model

                vec1 = model.get_topic_terms(x, topn=model.num_terms)


                #ldaVec1 = sorted(model.get_topic_terms(x, topn=model.num_terms))
                #ldaVec2 = sorted(model.get_topic_terms(y, topn=model.num_terms))

                ldaVec1 = model.get_topic_terms(x, topn = N)
                ldaVec2 = model.get_topic_terms(y, topn = N)

                #dense1 = gensim.matutils.sparse2full(ldaVec1, model.num_terms)
                #dense2 = gensim.matutils.sparse2full(ldaVec2, model.num_terms)

                sim = matutils.cossim(ldaVec1, ldaVec2)
                #simDict = (x, y, sim, self.model.show_topic(y))
                simDict = (x, y, sim)



                topicMatrixCos.append(simDict)

            simMatrixCos.append(sorted(topicMatrixCos, key=itemgetter(2), reverse=True))

        #for element in simMatrix:
        #    print simMatrix
        #topic1Sorted = sorted(simMatrixCos[0], key=itemgetter(2))
        #x = topic1Sorted
        return simMatrixCos
Example #58
def cosine_similarity(topics, segmented_topics, per_topic_postings, measure, gamma, num_docs):
    """
    This function calculates the indirect cosine measure. Given context vectors
    u = V(W') and w = V(W*) for the word sets of a pair S_i = (W', W*), the indirect
    cosine measure is computed as the cosine similarity between u and w. The formula used is:

    m_{sim}_{(m, \gamma)}(W', W*) = s_{sim}(\vec{V}^{\,}_{m,\gamma}(W'), \vec{V}^{\,}_{m,\gamma}(W*))

    where each vector \vec{V}^{\,}_{m,\gamma}(W') = \Bigg \{{\sum_{w_{i} \in W'}^{ } m(w_{i}, w_{j})^{\gamma}}\Bigg \}_{j = 1,...,|W|}

    Args:
    ----
    topics : Topics obtained from the trained topic model.
    segmented_topics : Output from the segmentation module of the segmented topics. Is a list of list of tuples.
    per_topic_postings : Output from the probability_estimation module. Is a dictionary of the posting list of all topics.
    measure : String. Direct confirmation measure to be used. Supported values are "nlr" (normalized log ratio).
    gamma : Gamma value for computing W', W* vectors.
    num_docs : Total number of documents in corresponding corpus.

    Returns:
    -------
    s_cos_sim : array of cosine similarity of the context vectors for each segmentation
    """
    if measure == 'nlr':
        # make normalized log ratio measure tuple
        measure = (direct_confirmation_measure.log_ratio_measure, True)
    else:
        raise ValueError("The direct confirmation measure you entered is not currently supported.")
    backtrack = {}  # Backtracking dictionary for storing measure values of topic id tuples eg. (1, 2).
    """
    For backtracking context vectors, we will create a list called w_backtrack to store (w_prime, w) or
    (w_star, w) tuples and a corresponding list context_vector_backtrack which will create a
    mapping of (w_prime or w_star, w) ---> context_vector.
    """
    w_backtrack = []
    context_vector_backtrack = []
    s_cos_sim = []
    for top_words, s_i in zip(topics, segmented_topics):
        for w_prime, w_star in s_i:
            # Step 1. Check if (w_prime, top_words) tuple in w_backtrack.
            # Step 2. If yes, return corresponding context vector
            w_prime_index = _present(w_prime, top_words, w_backtrack)
            if w_backtrack and w_prime_index != -1:
                w_prime_context_vectors = context_vector_backtrack[w_prime_index]
            else:
                w_prime_context_vectors, backtrack_i = _make_seg(w_prime, top_words, per_topic_postings, measure, gamma, backtrack, num_docs)
                backtrack.update(backtrack_i)
                # Update backtracking lists
                w_backtrack.append((w_prime, top_words))
                context_vector_backtrack.append(w_prime_context_vectors)

            # Step 1. Check if (w_star, top_words) tuple in w_backtrack.
            # Step 2. If yes, check if corresponding w is the same
            w_star_index = _present(w_star, top_words, w_backtrack)
            if w_backtrack and w_star_index != -1:
                w_star_context_vectors = context_vector_backtrack[w_star_index]
            else:
                w_star_context_vectors, backtrack_i = _make_seg(w_star, top_words, per_topic_postings, measure, gamma, backtrack, num_docs)
                backtrack.update(backtrack_i)
                # Update all backtracking lists
                w_backtrack.append((w_star, top_words))
                context_vector_backtrack.append(w_star_context_vectors)

            s_cos_sim_i = cossim(w_prime_context_vectors.items(), w_star_context_vectors.items())
            s_cos_sim.append(s_cos_sim_i)

    return s_cos_sim