Пример #1
0
    def getSimilarity4Statements(self, folderpath):
        """Score every candidate statement against the target statement.

        A 'tfidf' GetSimilarity helper loads the candidate claims and the
        target claim from folderpath, vectorises both and compares them with
        getCosineSimilarity. The largest score is printed and one
        [statement, similarity] row per candidate is passed to
        self.helper.dumpCsv.

        Arguments:
            folderpath {str} -- the path to data folder

        Returns:
            None -- similarities.csv is dumped via self.helper.dumpCsv with
                    folderpath + "/final" as the target folder.
        """
        scorer = Evaluate.GetSimilarity('tfidf', self.rootpath)
        candidate_tokens, id2claims = scorer.getCorpusOfCandidateClaims(
            folderpath)
        target_tokens = scorer.getCorpusOfTargetClaim(folderpath)

        # Candidates are vectorised before the target; the order is kept
        # because getVector is opaque here and may be order-sensitive.
        candidate_vectors = scorer.getVector(candidate_tokens)
        target_vector = scorer.getVector(target_tokens)
        similarity_matrix = scorer.getCosineSimilarity(
            candidate_vectors, target_vector)

        # Only the first row of the result is used (presumably the single
        # target statement -- confirm against getCosineSimilarity).
        best_score = max(similarity_matrix[0])
        print("max: {}".format(best_score))

        # Position in the first row -> similarity, looked up by claim id.
        score_by_id = dict(enumerate(list(similarity_matrix[0])))
        rows = [[claim, score_by_id[claim_id]]
                for claim_id, claim in id2claims.items()]
        self.helper.dumpCsv(folderpath + "/final", "similarities.csv",
                            ['statement', 'similarity'], rows)
Пример #2
0
    def getSimilarityStatements2Tweets(self, folderpath):
        """Get similarity between candidate claims and tweets.

        Each tweet is assigned to the candidate claim with which it has the
        highest similarity (argmax over each row of the similarity matrix),
        and three mappings derived from that assignment are dumped through
        self.helper.dumpJson.

        Arguments:
            folderpath {str} -- the path to data folder

        Returns:
            None -- index_tweet_2_index_candidate_claim.json;
                    index_candidate_claim_2_index_tweet.json;
                    index_candidate_claim_2_tweet.json are generated.
                    Nothing is generated when there are no claims or no
                    tweets.
        """
        getSimilarity = Evaluate.GetSimilarity('tfidf', self.rootpath)
        tokens_claims, id2claims = getSimilarity.getCorpusOfCandidateClaims(
            folderpath)
        print("length of statements ", len(tokens_claims))

        tokens_tweets, id2tweets = getSimilarity.getCorpusOfTweets(folderpath)
        print("length of tweets ", len(tokens_tweets))

        # bail out early when either corpus is empty
        if len(tokens_claims) == 0 or len(tokens_tweets) == 0:
            print("no statements or tweets.")
            return
        vectors_claims = getSimilarity.getVector(tokens_claims)
        print("shape of vectors_claims ", vectors_claims.shape)
        vector_tweets = getSimilarity.getVector(tokens_tweets)
        print("shape of vector_tweets ", vector_tweets.shape)

        # shape is #vector_tweets x #vectors_candidates
        similarities = getSimilarity.getCosineSimilarity(
            vectors_claims, vector_tweets)
        print("shape of similarities ", similarities.shape)

        # One output folder for all three files.
        final_dir = os.path.join(folderpath, "final")

        # Index of the best candidate claim for each tweet. The result is
        # materialised as a dict of plain Python ints: an enumerate object is
        # a one-shot iterator (it cannot be both dumped and looped over) and
        # numpy integers are not accepted by the json module.
        best_claim_per_tweet = np.asarray(
            np.argmax(similarities, axis=1)).ravel().tolist()
        index_tweet_2_max_index_candidate_claim = dict(
            enumerate(best_claim_per_tweet))
        self.helper.dumpJson(final_dir,
                             "index_tweet_2_index_candidate_claim.json",
                             index_tweet_2_max_index_candidate_claim)

        # reverse the key and value: {index_claim: [index_tweet, ...]}
        max_index_candidate_claim_2_index_tweet = defaultdict(list)
        for tid, sid in index_tweet_2_max_index_candidate_claim.items():
            max_index_candidate_claim_2_index_tweet[sid].append(tid)
        self.helper.dumpJson(final_dir,
                             "index_candidate_claim_2_index_tweet.json",
                             max_index_candidate_claim_2_index_tweet)

        # generate {index_claim: [tweet1, tweet2, ...]}
        index_candidate_claim_2_tweet = {
            index_claim: [id2tweets[index_tweet] for index_tweet in tweet_ids]
            for index_claim, tweet_ids in
            max_index_candidate_claim_2_index_tweet.items()
        }
        self.helper.dumpJson(final_dir,
                             "index_candidate_claim_2_tweet.json",
                             index_candidate_claim_2_tweet)
        print("index_candidate_claim_2_tweet.json has been saved.")
Пример #3
0
    def getSimilarityStatements2Tweets(self, folderpath):
        """Get similarity between candidate statements and tweets.

        Each tweet is assigned to the candidate statement with which it has
        the highest similarity (argmax over each row of the similarity
        matrix); the assignment and its inverse are dumped through
        self.helper.dumpJson.

        Arguments:
            folderpath {str} -- the path to data folder

        Returns:
            None -- index_tweet_2_index_candidate_statement.json and
                    index_candidate_statement_2_index_tweet.json are
                    generated. Nothing is generated when there are no
                    statements or no tweets.
        """
        getSimilarity = Evaluate.GetSimilarity('tfidf', self.rootpath)
        # Only the token corpora are needed here; the id lookups are ignored.
        tokens_statements, _ = getSimilarity.getCorpusFromCandidateStatements4Cluster(
            folderpath)
        print("length of statements ", len(tokens_statements))

        tokens_tweets, _ = getSimilarity.getCorpusFromTweets4Cluster(
            folderpath)
        print("length of tweets ", len(tokens_tweets))

        # bail out early when either corpus is empty
        if len(tokens_statements) == 0 or len(tokens_tweets) == 0:
            print("no statements or tweets.")
            return
        vectors_candidates = getSimilarity.getVector(tokens_statements)
        print("shape of vectors_candidates ", vectors_candidates.shape)
        vector_tweets = getSimilarity.getVector(tokens_tweets)
        print("shape of vector_tweets ", vector_tweets.shape)

        # shape is #vector_tweets x #vectors_candidates
        similarities = getSimilarity.getCosineSimilarity(
            vectors_candidates, vector_tweets)
        print("shape of similarities ", similarities.shape)

        # Index of the best candidate statement for each tweet. Built once as
        # a dict of plain Python ints: the previous enumerate object was
        # exhausted by dict(...) before the reverse loop below could read it,
        # which left the reverse mapping empty.
        best_statement_per_tweet = np.asarray(
            np.argmax(similarities, axis=1)).ravel().tolist()
        index_tweet_2_max_index_candidate_statement = dict(
            enumerate(best_statement_per_tweet))
        self.helper.dumpJson(
            folderpath + "/final",
            "index_tweet_2_index_candidate_statement.json",
            index_tweet_2_max_index_candidate_statement)

        # reverse the key and value: {index_statement: [index_tweet, ...]}
        max_index_candidate_statement_2_index_tweet = defaultdict(list)
        for tid, sid in index_tweet_2_max_index_candidate_statement.items():
            max_index_candidate_statement_2_index_tweet[sid].append(tid)
        self.helper.dumpJson(folderpath + "/final",
                             "index_candidate_statement_2_index_tweet.json",
                             max_index_candidate_statement_2_index_tweet)