# Example #1
class SimTopicLists:
    """Compare the topics of two topic lists pairwise and report the results.

    Every comparison method (``bha_distance``, ``kl_divergence``, ...)
    returns a 2D list in which entry ``[i][j]`` is the measure computed by
    ``self.sim`` between ``t_list1[i]`` and ``t_list2[j]``::

        [[sim(t1_0, t2_0), sim(t1_0, t2_1), sim(t1_0, t2_2)],
         [sim(t1_1, t2_0), sim(t1_1, t2_1), sim(t1_1, t2_2)],
         [sim(t1_2, t2_0), sim(t1_2, t2_1), sim(t1_2, t2_2)]]

    The ``show_results*`` methods write such a matrix to a file-like object,
    either as an HTML table with shaded cells or as a fixed-width text table.

    NOTE(review): several statements of this class were truncated in the
    copy this was restored from (table scaffolding tags, ``else`` branches,
    ``append`` calls).  They were reconstructed from the surrounding code;
    confirm the exact HTML/text layout against a known-good output.
    """

    # Line written at the top of every rendered table.
    _ORIENTATION_NOTE = (
        "\ntopic List 1 is vertical and topic List 2 is horizontal\n")
    _TABLE_STYLE = "<style>table, th, td {border: 1px solid black;}</style>"
    # Width of one column in the fixed-width text tables.
    _CELL_WIDTH = 14

    def __init__(self):
        self.sim = Similarity()

    # ------------------------------------------------------------------
    # Pairwise comparison
    # ------------------------------------------------------------------

    @staticmethod
    def _pairwise(measure, t_list1, t_list2):
        """Return ``[[measure(a, b) for b in t_list2] for a in t_list1]``."""
        return [[measure(first, second) for second in t_list2]
                for first in t_list1]

    def bha_distance(self, t_list1, t_list2):
        """Compute ``self.sim.bha_distance`` for every topic pair.

        :param t_list1: topics indexing the rows of the result
        :param t_list2: topics indexing the columns of the result
        :return: a 2D list of the results
        """
        return self._pairwise(self.sim.bha_distance, t_list1, t_list2)

    def kl_divergence(self, t_list1, t_list2):
        """Compute ``self.sim.kl_divergence`` for every topic pair.

        :return: a 2D list of the results
        """
        return self._pairwise(self.sim.kl_divergence, t_list1, t_list2)

    def cos_distance(self, t_list1, t_list2):
        """Compute ``self.sim.cosine_distance`` for every topic pair.

        :return: a 2D list of the results
        """
        return self._pairwise(self.sim.cosine_distance, t_list1, t_list2)

    def kendall(self, t_list1, t_list2):
        """Compute ``self.sim.kendall_tau`` for every topic pair.

        The topic from ``t_list2`` is passed as the first argument, as in
        the original call ``kendall_tau(value2, value1)``.

        :return: a 2D list of whatever ``kendall_tau`` returns
        """
        return self._pairwise(
            lambda first, second: self.sim.kendall_tau(second, first),
            t_list1, t_list2)

    def dcg(self, t_list1, t_list2, word_limit=0):
        """Compute ``self.sim.dcg_difference`` for every topic pair.

        :param word_limit: forwarded to ``dcg_difference``; ``0`` means
            "use ``len(t_list1)``".
            NOTE(review): that default is the number of topics, not the
            number of words in a topic -- confirm this is intended.
        :return: a 2D list of the results
        """
        if word_limit == 0:
            word_limit = len(t_list1)
        return self._pairwise(
            lambda first, second: self.sim.dcg_difference(
                first, second, word_limit),
            t_list1, t_list2)

    def jaccard(self, t_list1, t_list2, threshold):
        """Compute ``self.sim.jaccard_distance`` for every topic pair.

        :param threshold: forwarded unchanged to ``jaccard_distance``
        :return: a 2D list of the results
        """
        return self._pairwise(
            lambda first, second: self.sim.jaccard_distance(
                first, second, threshold),
            t_list1, t_list2)

    # ------------------------------------------------------------------
    # Persisting distance lists
    # ------------------------------------------------------------------

    def write_distance(self, distance_list, ofile):
        """Write the strict upper triangle of a square matrix, one per line.

        For a matrix built from one topic list ``t0, t1, t2`` against itself
        this writes ``sim(t0,t1), sim(t0,t2), sim(t1,t2)``.

        :param distance_list: 2D list from one of the comparison methods
        :param ofile: writable text file object
        :return: the list of values that were written, in order
        """
        written = []
        for row_index, row in enumerate(distance_list):
            for value in row[row_index + 1:]:
                ofile.write(str(value) + "\n")
                written.append(value)
        return written

    def read_distance_list(self, ifile):
        """Read back values written by ``write_distance``.

        The loop body was missing in the restored source; each non-blank
        line is parsed as a float, mirroring ``write_distance``.

        :param ifile: iterable of text lines (e.g. an open file)
        :return: a list of floats
        """
        dist_list = []
        for line in ifile:
            text = line.strip()
            if text:
                dist_list.append(float(text))
        return dist_list

    def give_dist_names(self, dist_list, topics_count, corpus_type):
        """Pair every value of a flat distance list with a topic-pair name.

        With ``dist_list=[0.3, 0.2, 0.4]``, ``topics_count=3`` and
        ``corpus_type="tfidf"`` the result is
        ``[("tfidf_t3t0_t1", 0.3), ("tfidf_t3t0_t2", 0.2),
        ("tfidf_t3t1_t2", 0.4)]``.

        :param dist_list: upper-triangle values in ``write_distance`` order
        :param topics_count: number of topics the values were computed from
        :param corpus_type: label used as the prefix of every name
        :return: a list of ``(name, value)`` tuples
        :raises IndexError: if ``dist_list`` has fewer values than there
            are topic pairs
        """
        prefix = corpus_type + "_t" + str(topics_count) + "t"
        dist_names = []
        index = 0
        for t1 in range(topics_count):
            for t2 in range(t1 + 1, topics_count):
                name = prefix + str(t1) + "_t" + str(t2)
                dist_names.append((name, dist_list[index]))
                index += 1
        return dist_names

    def read_distance_rank(self, dist_list, topics_count, corpus_type):
        """Same contract as ``give_dist_names``; kept as a separate name."""
        return self.give_dist_names(dist_list, topics_count, corpus_type)

    # ------------------------------------------------------------------
    # Locating extremes
    # ------------------------------------------------------------------

    @staticmethod
    def _best_two(num_list, better):
        """Return the indices of the best and second-best entries.

        ``better(a, b)`` is a strict ordering (``a < b`` for minima,
        ``a > b`` for maxima).  The scan and its tie handling follow the
        original implementation: an entry equal to the current best, at a
        different index, replaces the current runner-up.

        A single-element list yields ``(0, None)``; an empty list raises
        ``IndexError``.
        """
        if len(num_list) == 1:
            return 0, None

        if better(num_list[0], num_list[1]):
            best, second, i1, i2 = num_list[0], num_list[1], 0, 1
        else:
            best, second, i1, i2 = num_list[1], num_list[0], 1, 0

        for index, value in enumerate(num_list):
            if better(value, best):
                second, i2 = best, i1
                best, i1 = value, index
            elif ((better(best, value) and better(value, second))
                  or (best == value and i1 != index)):
                second, i2 = value, index

        return i1, i2

    def find_smallest(self, num_list):
        """Return ``(index_of_smallest, index_of_second_smallest)``."""
        return self._best_two(num_list, lambda a, b: a < b)

    def find_smallest_self(self, num_list):
        """Same result as ``find_smallest``; kept as a separate name."""
        return self._best_two(num_list, lambda a, b: a < b)

    def find_largest_two(self, num_list):
        """Return ``(index_of_largest, index_of_second_largest)``."""
        return self._best_two(num_list, lambda a, b: a > b)

    def find_largest_one(self, num_list):
        """Return the index of the first occurrence of the largest value.

        :raises IndexError: if ``num_list`` is empty
        """
        largest = num_list[0]
        position = 0
        for index, value in enumerate(num_list):
            if value > largest:
                largest = value
                position = index
        return position

    def rank(self, nlist):
        """Label every value with its position in ascending sorted order.

        :param nlist: a list of numbers
        :return: a list of strings ``"<rank> <value to 6 decimals>"`` in the
            order of ``nlist``; rank 0 is the smallest value.  Equal values
            share the highest rank among them.
        """
        rank_of = {}
        for position, number in enumerate(sorted(nlist)):
            rank_of[number] = position
        return [str(rank_of[number]) + " " + "{0:.6f}".format(number)
                for number in nlist]

    # ------------------------------------------------------------------
    # HTML rendering
    # ------------------------------------------------------------------

    def _write_html_table(self, distance_list, file, cell):
        """Write ``distance_list`` as an HTML table.

        ``cell(row_index, col_index)`` returns ``(css_color, text)`` for one
        cell.  The ``<table>``/``</tr>``/``</span></td>`` scaffolding is
        reconstructed (see the class NOTE).
        """
        columns = len(distance_list[0]) if distance_list else 0

        file.write(self._ORIENTATION_NOTE)
        file.write(self._TABLE_STYLE)

        # Header row: one label per topic of list 2.
        file.write("<table><tr><th></th>")
        for col_index in range(columns):
            file.write("<th> topic" + str(col_index) + "</th>")
        file.write("</tr>")

        for row_index, row in enumerate(distance_list):
            # Row label: the topic of list 1.
            file.write("<tr><td> topic" + str(row_index) + "</td>")
            for col_index in range(len(row)):
                css_color, text = cell(row_index, col_index)
                file.write("<td><span style='background-color: " +
                           css_color + "'>")
                file.write(text + "</span></td>")
            file.write("</tr>")
        file.write("</table>")

    @staticmethod
    def _color_step(distance_list, full_scale):
        """Return ``int(full_scale / column_count)``, or 0 without columns."""
        if not distance_list or not distance_list[0]:
            return 0
        return int(full_scale / len(distance_list[0]))

    @staticmethod
    def _grey(level):
        """Return the CSS ``rgb(l,l,l)`` string for a grey level string."""
        return "rgb(" + level + "," + level + "," + level + ")"

    def show_results_rank(self, distance_list, file):
        """Write an HTML table whose cells are coloured by in-row rank.

        Each cell shows ``"<rank> <value>"`` (see ``rank``); the background
        is the 24-bit colour ``0xFFFFFF - rank * step``.

        :param distance_list: a 2D list of data
        :param file: writable text file object
        """
        step = self._color_step(distance_list, 16777215)
        ranked = [self.rank(row) for row in distance_list]

        def cell(row_index, col_index):
            text = ranked[row_index][col_index]
            ranknum = float(text.split()[0])
            color = format(int(16777215 - ranknum * step), "06X")
            return "#" + color, text

        self._write_html_table(distance_list, file, cell)

    def show_results_rank_bw(self, distance_list, file):
        """Write an HTML table shaded in grey by in-row rank.

        Rank 0 (smallest value) is white; higher ranks are darker.

        :param distance_list: a 2D list of data
        :param file: writable text file object
        """
        step = self._color_step(distance_list, 255)
        ranked = [self.rank(row) for row in distance_list]

        def cell(row_index, col_index):
            text = ranked[row_index][col_index]
            ranknum = int(text.split()[0])
            return self._grey(str(255 - ranknum * step)), text

        self._write_html_table(distance_list, file, cell)

    def show_results_rank_reverse(self, distance_list, file):
        """Write an HTML table shaded in grey by in-row rank, reversed.

        Rank 0 (smallest value) is black; higher ranks are lighter.

        :param distance_list: a 2D list of data
        :param file: writable text file object
        """
        step = self._color_step(distance_list, 255.0)
        ranked = [self.rank(row) for row in distance_list]

        def cell(row_index, col_index):
            text = ranked[row_index][col_index]
            ranknum = int(text.split()[0])
            return self._grey(str(ranknum * step)), text

        self._write_html_table(distance_list, file, cell)

    def show_results_value(self, distance_list, file):
        """Write an HTML table shaded in grey by value.

        The smallest value of the whole matrix is white and the largest is
        black.  When all values are equal every cell is white (the original
        divided by zero in that case).

        :param distance_list: a 2D list of data
        :param file: writable text file object
        """
        values = [value for row in distance_list for value in row]
        lowest = min(values) if values else 0
        span = (max(values) - lowest) if values else 0

        def cell(row_index, col_index):
            value = distance_list[row_index][col_index]
            percent = (value - lowest) / float(span) if span else 0.0
            level = str(int(255 * (1 - percent)))
            return self._grey(level), "{0:.6f}".format(value)

        self._write_html_table(distance_list, file, cell)

    def show_results_value_reverse(self, distance_list, file):
        """Write an HTML table shaded in grey by value, reversed.

        Values that round to 1.000000 are always white and are ignored when
        looking for the maximum; the remaining values go from black (the
        smallest) to white (the largest).

        :param distance_list: a 2D list of data
        :param file: writable text file object
        """
        values = [value for row in distance_list for value in row]
        below_one = [value for value in values if round(value, 6) != 1.000000]
        lowest = min(values) if values else 0
        # Without any value below 1 no cell needs the scale at all.
        span = (max(below_one) - lowest) if below_one else 0

        def cell(row_index, col_index):
            value = distance_list[row_index][col_index]
            if round(value, 6) == 1.000000:
                level = "255"
            else:
                percent = (value - lowest) / float(span) if span else 0.0
                level = str(int(255 * percent))
            return self._grey(level), "{0:.6f}".format(value)

        self._write_html_table(distance_list, file, cell)

    # ------------------------------------------------------------------
    # Fixed-width text rendering
    # ------------------------------------------------------------------

    def _write_text_table(self, distance_list, file, prefix):
        """Write ``distance_list`` as a fixed-width text table.

        ``prefix(row_index, col_index)`` returns the marker placed in front
        of the value of one cell.  A dashed rule follows every row; the rule
        after the header row is reconstructed (see the class NOTE).
        """
        width = self._CELL_WIDTH
        columns = len(distance_list[0]) if distance_list else 0
        rule = "\n" + "-" * width * (columns + 1) + "\n"

        file.write(self._ORIENTATION_NOTE)

        # Header row: one label per topic of list 2.
        file.write("{:{w}}".format("T_List1/T_List2", w=width))
        for col_index in range(columns):
            file.write("{:{w}}".format("     topic" + str(col_index),
                                       w=width))
        file.write(rule)

        for row_index, row in enumerate(distance_list):
            # Row label: the topic of list 1.
            file.write("{:{w}}".format("    topic" + str(row_index),
                                       w=width))
            for col_index, value in enumerate(row):
                text = prefix(row_index, col_index) + "{0:.6f}".format(value)
                file.write("{:{w}}".format(text, w=width))
            file.write(rule)

    def show_results(self, distance_list, file):
        """Write a text table marking row and column maxima.

        ``**`` marks the largest value of its row, ``++`` the largest value
        of its column and ``*+`` a value that is both.

        :param distance_list: a 2D list of data (rows of equal length)
        :param file: writable text file object
        """
        columns = len(distance_list[0]) if distance_list else 0
        row_best = [self.find_largest_one(row) for row in distance_list]
        # Column maxima do not depend on the row, so compute them once.
        col_best = [
            self.find_largest_one([row[col] for row in distance_list])
            for col in range(columns)
        ]

        def prefix(row_index, col_index):
            in_row = col_index == row_best[row_index]
            in_col = row_index == col_best[col_index]
            if in_row and in_col:
                return "|  *+"
            if in_row:
                return "|  **"
            if in_col:
                return "|  ++"
            return "|    "

        self._write_text_table(distance_list, file, prefix)

    def show_results_self(self, distance_list, file):
        """Write a text table marking the largest value of each row (``**``).

        :param distance_list: a 2D list of data
        :param file: writable text file object
        """
        row_best = [self.find_largest_one(row) for row in distance_list]

        def prefix(row_index, col_index):
            if col_index == row_best[row_index]:
                return "|  **"
            return "|    "

        self._write_text_table(distance_list, file, prefix)

    def show_results_2min_self(self, distance_list, file):
        """Write a text table marking the two smallest values of each row.

        ``##`` marks the smallest value and ``#`` the second smallest.

        :param distance_list: a 2D list of data
        :param file: writable text file object
        """
        row_pairs = [self.find_smallest(row) for row in distance_list]

        def prefix(row_index, col_index):
            min_1, min_2 = row_pairs[row_index]
            if col_index == min_1:
                return "|##"
            if col_index == min_2:
                return "| #"
            return "|   "

        self._write_text_table(distance_list, file, prefix)

    def show_results_2max_self(self, distance_list, file):
        """Write a text table marking the second-largest value of each row.

        The largest value is deliberately left unmarked, exactly as in the
        original code; only the runner-up gets ``**``.
        NOTE(review): presumably because the largest entry of a
        self-comparison row is the topic compared with itself -- confirm.

        :param distance_list: a 2D list of data
        :param file: writable text file object
        """
        row_pairs = [self.find_largest_two(row) for row in distance_list]

        def prefix(row_index, col_index):
            max_1, max_2 = row_pairs[row_index]
            if col_index == max_1:
                return "|   "
            if col_index == max_2:
                return "| **"
            return "|   "

        self._write_text_table(distance_list, file, prefix)
# Example #2
class BCInterpreter:
    """A helper class used in the interpretation of BC distance.

    It generates random distributions with ``numpy.random.dirichlet`` and
    averages ``self.bc.bha_distance`` over pairs of them.

    NOTE(review): the ``bc_similarity`` signature and several call
    continuations were truncated in the copy this was restored from and
    have been reconstructed; see the notes on the individual methods.
    """

    def __init__(self):
        self.bc = Similarity()

    def bc_similarity(self, seq_size, sample_size=30, sample_times=50,
                      degree=1.0, bc_times=20):
        """Average BC distance between a distribution and a resampled copy.

        Each round draws a Dirichlet distribution, estimates it again from
        random samples (``mean_rand_dist``) and measures the distance
        between the two.

        NOTE(review): the parameter list was lost; the order follows the
        original docstring, ``sample_size``/``sample_times``/``bc_times``
        reuse the defaults of the sibling methods and ``degree=1.0`` is a
        placeholder -- confirm against the callers.

        :param seq_size: the size of the sequence (population)
        :param sample_size: the size of the sample
        :param sample_times: times of sampling
        :param degree: Dirichlet concentration. The smaller the number, the
            more sparse the distribution is; the larger the number, the more
            uniform the distribution is.
        :param bc_times: times to calculate bc_distance
        :return: an averaged bc_distance value
        :raises ZeroDivisionError: if ``bc_times`` is 0
        """
        total = 0
        for _ in range(bc_times):
            dist1 = list(numpy.random.dirichlet([degree] * seq_size))
            dist2 = self.mean_rand_dist(dist1, seq_size, sample_size,
                                        sample_times)
            total += self.bc.bha_distance(self.dist_to_topic(dist1),
                                          self.dist_to_topic(dist2))
        return total / bc_times

    def bc_difference(self, seq_size, degree1, degree2, bc_times=20):
        """Average BC distance between two independent distributions.

        :param seq_size: the size of both distributions
        :param degree1: Dirichlet concentration of the first distribution
        :param degree2: Dirichlet concentration of the second distribution
        :param bc_times: times to calculate bc_distance
        :return: an averaged bc_distance value
        :raises ZeroDivisionError: if ``bc_times`` is 0
        """
        total = 0
        for _ in range(bc_times):
            dist1 = list(numpy.random.dirichlet([degree1] * seq_size))
            dist2 = list(numpy.random.dirichlet([degree2] * seq_size))
            total += self.bc.bha_distance(self.dist_to_topic(dist1),
                                          self.dist_to_topic(dist2))
        return total / bc_times

    def mean_rand_dist(self, dist, seq_size, sample_size=30, sample_times=50):
        """Average several sampled estimates of ``dist`` element-wise.

        :param dist: the distribution of the population
        :param seq_size: the range of the population
        :param sample_size: the size of each sample
        :param sample_times: how many samples are drawn and averaged
        :return: a list of ``seq_size`` averaged probabilities
        :raises ZeroDivisionError: if ``sample_times`` is 0 (and
            ``seq_size`` is not)
        """
        estimates = [self.rand_dist(dist, seq_size, sample_size)
                     for _ in range(sample_times)]
        return [sum(estimate[position] for estimate in estimates)
                / sample_times
                for position in range(seq_size)]

    def rand_dist(self, dist, seq_size, sample_size):
        """Estimate a distribution from one random sample of a population.

        :param dist: the distribution of the population
        :param seq_size: the range of the population
        :param sample_size: the size of the sample
        :return: a list of ``seq_size`` relative frequencies
        """
        # get the random sample
        sample = numpy.random.choice(seq_size, sample_size, p=dist)

        # Count every outcome in one pass instead of rescanning the sample
        # once per possible value.
        counts = numpy.bincount(sample, minlength=seq_size)
        return [float(count) / float(sample_size) for count in counts]

    def dist_to_topic(self, dist):
        """Output a topic object with the input distribution.

        :param dist: a distribution list
        :return: a ``topicio.Topic`` whose ``words_dist`` is the list of
            ``(index, probability)`` pairs of ``dist``
        """
        topic = topicio.Topic()
        topic.words_dist = list(enumerate(dist))
        return topic