def analyze_num_lang_edition(self, classname, people):
    """Analyze the distribution of language-edition counts by gender.

    Splits ``people`` into men and women, logs descriptive statistics,
    runs a Mann-Whitney U test, and renders percentage, boxplot, ratio
    (with a permutation baseline and a lowess fit) and CCDF plots under
    ``img/<classname>``.

    :param classname: label used for the image folder and the log section
    :param people: DataFrame with (at least) ``gender`` ("male"/"female")
        and ``edition_count`` columns; a ``random_gender`` column is added
        as a side effect of the permutation baseline
    :return: dict with median/mean/sem of ``edition_count`` per gender
    """
    path = "img/" + classname
    men = people[people.gender == "male"]
    women = people[people.gender == "female"]
    # create folder if not exist
    if not os.path.exists(path):
        os.makedirs(path)
    self.logfile.write("\n\n\n\ " + classname + "\n")
    edition_counts = np.append(women.edition_count.values, men.edition_count.values)
    labels = ["female (" + str(len(women.index)) + ")", "male (" + str(len(men.index)) + ")"]
    max_num_editions = np.max(edition_counts)
    men_percentage = 0
    if men.shape[0] > 0:
        # fraction of men having exactly x language editions, x = 1..max-1
        men_percentage = [
            men[men.edition_count == x].shape[0] / float(men.shape[0]) for x in range(1, max_num_editions)
        ]
        # "local" = fewer than 2 editions
        self.logfile.write("\n % of local men ")
        self.logfile.write(str(men[men.edition_count < 2].shape[0] / float(men.shape[0])))
        self.logfile.write(
            "\n percentage men %s" % str(len(men.index) / float((len(women.index) + len(men.index))))
        )
    women_percentage = 0
    if women.shape[0] > 0:
        women_percentage = [
            women[women.edition_count == x].shape[0] / float(women.shape[0]) for x in range(1, max_num_editions)
        ]
        self.logfile.write("\n\n % local women ")
        self.logfile.write(str(women[women.edition_count < 2].shape[0] / float(women.shape[0])))
        self.logfile.write(
            "\n percentage women %s" % str(len(women.index) / float((len(women.index) + len(men.index))))
        )
    if women.shape[0] > 0 and men.shape[0] > 0:
        # Mann-Whitney U: probability that a score randomly drawn from one
        # population is greater than a score drawn from the other.
        U, p = stats.mstats.mannwhitneyu(women.edition_count, men.edition_count)
        ut.write_mannwithneyu(U, p, women.edition_count, men.edition_count, self.logfile)
        ut.plot_percentage(
            women_percentage,
            men_percentage,
            ["pink", "blue"],
            range(1, max_num_editions),
            path + "/numedition_gender_percentage" + self.pre + "-" + self.post + ".png",
        )
        self.genderBoxplots(women, men, labels, path)
        self.logfile.write("\n\n num women %s" % len(women.index))
        self.logfile.write("\n num men %s" % len(men.index))
        data = [women.edition_count.values, men.edition_count.values]
        # Compute the qth percentile of women and men
        q75 = [np.percentile(x, q=75) for x in data]
        self.logfile.write("\n third quartil (75th percentile): " + str(q75))
        q95 = [np.percentile(x, q=95) for x in data]
        self.logfile.write("\n 95th percentile: " + str(q95))
        q99 = [np.percentile(x, q=99) for x in data]
        self.logfile.write("\n 99th percentile: " + str(q99))
        q99_women = q99[0]
        q99_men = q99[1]
        self.logfile.write("\n threshold women 99th percentile: %s" % q99_women)
        self.logfile.write("\n threshold men 99th percentile: %s" % q99_men)
        men_percentage = 0
        # cut both distributions at the smaller of the two 99th percentiles
        th = np.min(q99)
        if men.shape[0] > 0:
            men_percentage = [men[men.edition_count == x].shape[0] / float(men.shape[0]) for x in range(1, int(th))]
        women_percentage = 0
        if women.shape[0] > 0:
            women_percentage = [
                women[women.edition_count == x].shape[0] / float(women.shape[0]) for x in range(1, int(th))
            ]
        ut.plot_percentage(
            women_percentage,
            men_percentage,
            ["pink", "blue"],
            range(1, int(th)),
            path + "/numedition_gender_percentage" + self.pre + "-" + self.post + "_99.png",
        )
        # RANDOM BASELINE FOR RATIO: permute the gender labels 999 times and
        # recompute the female/male frequency ratios for every permutation.
        fake_ratios_norm = list()
        fake_ratios = list()
        for i in range(1, 1000):
            people["random_gender"] = pd.Series(np.random.permutation(people.gender.values), index=people.index)
            fake_men = people[people.random_gender == "male"]
            fake_women = people[people.random_gender == "female"]
            item_frequency_fake_female = itemfreq(np.array(fake_women["edition_count"].values.tolist()))
            item_frequency_fake_male = itemfreq(np.array(fake_men["edition_count"].values.tolist()))
            fake_ratios_norm.append(self.get_ratio(item_frequency_fake_female, item_frequency_fake_male, True))
            # BUGFIX: the raw baseline must use the non-normalized ratio; it
            # previously passed True and just duplicated the normalized one,
            # so mean_fake_ratio was plotted against a mismatched scale.
            fake_ratios.append(self.get_ratio(item_frequency_fake_female, item_frequency_fake_male, False))
        item_frequency_female = itemfreq(np.array(women["edition_count"].values.tolist()))
        item_frequency_male = itemfreq(np.array(men["edition_count"].values.tolist()))
        ratio_norm = self.get_ratio(item_frequency_female, item_frequency_male, True)
        # average each baseline ratio per edition-count key over all permutations
        mean_fake_ratio_norm = {}
        mean_fake_ratio = {}
        for key in ratio_norm.keys():
            vals = []
            vals_norm = []
            for dic1 in fake_ratios_norm:
                if key in dic1:
                    vals_norm.append(dic1.get(key))
            mean_fake_ratio_norm[key] = np.mean(vals_norm)
            for dic2 in fake_ratios:
                if key in dic2:
                    vals.append(dic2.get(key))
            mean_fake_ratio[key] = np.mean(vals)
        # NOTE: if we plot the normalized ratio we should take the log, since
        # the upper bound is 0 but the ratio can become extremely small.
        ratio = self.get_ratio(item_frequency_female, item_frequency_male, False)
        lowess = sm.nonparametric.lowess(ratio.values(), ratio.keys(), frac=0.1)
        ut.plotratio(
            ratio,
            lowess,
            mean_fake_ratio,
            ["b^", "g", "r--"],
            ["empirical gender", "lowess fit", "random gender"],
            self.pre + "-" + self.post,
            path + "/numedition_gender_ratio" + self.pre + "-" + self.post + ".png",
            "Num Editions",
            "Male/Female",
            False,
            False,
        )
        ut.plot_cdf(
            list([item_frequency_female, item_frequency_male]),
            labels,
            ["pink", "blue"],
            path + "/numedition_gender_ccdf" + self.pre + "-" + self.post + ".png",
            "Num Editions",
            True,
            False,
            True,
        )
    self.logfile.write("\n\n men median(men mean), women median (women mean)")
    self.logfile.write(
        "\n "
        + str(np.median(men.edition_count.values))
        + "("
        + str(np.mean(men.edition_count.values))
        + "), "
        + str(np.median(women.edition_count.values))
        + "("
        + str(np.mean(women.edition_count.values))
        + ")"
    )
    return {
        "class": classname,
        "median-men": np.median(men.edition_count.values),
        "mean-men": np.mean(men.edition_count.values),
        "sem-men": stats.sem(men.edition_count.values),
        "sem-women": stats.sem(women.edition_count.values),
        "median-women": np.median(women.edition_count.values),
        "mean-women": np.mean(women.edition_count.values),
    }
def analyze_num_lang_edition(self, classname, people):
    """Analyze the distribution of language-edition counts by gender.

    NOTE(review): this is a second definition of the same method and
    shadows the earlier copy; consider deleting one of the two.

    Splits ``people`` into men and women, logs descriptive statistics,
    runs a Mann-Whitney U test, and renders percentage, boxplot, ratio
    (with a permutation baseline and a lowess fit) and CCDF plots under
    ``img/<classname>``.

    :param classname: label used for the image folder and the log section
    :param people: DataFrame with ``gender`` ("male"/"female") and
        ``edition_count`` columns; gains a ``random_gender`` column
    :return: dict with median/mean/sem of ``edition_count`` per gender
    """
    path = "img/" + classname
    men = people[people.gender == "male"]
    women = people[people.gender == "female"]
    #create folder if not exist
    if not os.path.exists(path):
        os.makedirs(path)
    self.logfile.write("\n\n\n\ " + classname + "\n")
    edition_counts = np.append(women.edition_count.values,
                               men.edition_count.values)
    labels = [
        'female (' + str(len(women.index)) + ')',
        'male (' + str(len(men.index)) + ')'
    ]
    max_num_editions = np.max(edition_counts)
    men_percentage = 0
    if (men.shape[0] > 0):
        # fraction of men having exactly x language editions, x = 1..max-1
        men_percentage = [
            men[men.edition_count == x].shape[0] / float(men.shape[0])
            for x in range(1, max_num_editions)
        ]
        # "local" = fewer than 2 editions
        self.logfile.write("\n % of local men ")
        self.logfile.write(
            str(men[men.edition_count < 2].shape[0] / float(men.shape[0])))
        self.logfile.write("\n percentage men %s" % str(
            len(men.index) / float((len(women.index) + len(men.index)))))
    women_percentage = 0
    if (women.shape[0] > 0):
        women_percentage = [
            women[women.edition_count == x].shape[0] / float(women.shape[0])
            for x in range(1, max_num_editions)
        ]
        self.logfile.write("\n\n % local women ")
        self.logfile.write(
            str(women[women.edition_count < 2].shape[0] / float(women.shape[0])))
        self.logfile.write("\n percentage women %s" % str(
            len(women.index) / float((len(women.index) + len(men.index)))))
    if (women.shape[0] > 0 and men.shape[0] > 0):
        # Mann-Whitney U: probability that a score randomly drawn from one
        # population is greater than a score drawn from the other.
        U, p = stats.mstats.mannwhitneyu(women.edition_count,
                                         men.edition_count)
        ut.write_mannwithneyu(U, p, women.edition_count, men.edition_count,
                              self.logfile)
        ut.plot_percentage(
            women_percentage, men_percentage, ['pink', 'blue'],
            range(1, max_num_editions), path + '/numedition_gender_percentage'
            + self.pre + "-" + self.post + '.png')
        self.genderBoxplots(women, men, labels, path)
        self.logfile.write("\n\n num women %s" % len(women.index))
        self.logfile.write("\n num men %s" % len(men.index))
        data = [women.edition_count.values, men.edition_count.values]
        # Compute the qth percentile of women and men
        q75 = [np.percentile(x, q=75) for x in data]
        self.logfile.write("\n third quartil (75th percentile): " + str(q75))
        q95 = [np.percentile(x, q=95) for x in data]
        self.logfile.write("\n 95th percentile: " + str(q95))
        q99 = [np.percentile(x, q=99) for x in data]
        self.logfile.write("\n 99th percentile: " + str(q99))
        q99_women = q99[0]
        q99_men = q99[1]
        self.logfile.write("\n threshold women 99th percentile: %s" % q99_women)
        self.logfile.write("\n threshold men 99th percentile: %s" % q99_men)
        men_percentage = 0
        # cut both distributions at the smaller of the two 99th percentiles
        th = np.min(q99)
        if (men.shape[0] > 0):
            men_percentage = [
                men[men.edition_count == x].shape[0] / float(men.shape[0])
                for x in range(1, int(th))
            ]
        women_percentage = 0
        if (women.shape[0] > 0):
            women_percentage = [
                women[women.edition_count == x].shape[0] /
                float(women.shape[0]) for x in range(1, int(th))
            ]
        ut.plot_percentage(
            women_percentage, men_percentage, ['pink', 'blue'],
            range(1, int(th)), path + '/numedition_gender_percentage' +
            self.pre + "-" + self.post + '_99.png')
        # RANDOM BASELINE FOR RATIO: permute the gender labels 999 times and
        # recompute the female/male frequency ratios for every permutation.
        fake_ratios_norm = list()
        fake_ratios = list()
        for i in range(1, 1000):
            people["random_gender"] = pd.Series(
                np.random.permutation(people.gender.values),
                index=people.index)
            fake_men = people[people.random_gender == "male"]
            fake_women = people[people.random_gender == "female"]
            item_frequency_fake_female = itemfreq(
                np.array(fake_women['edition_count'].values.tolist()))
            item_frequency_fake_male = itemfreq(
                np.array(fake_men['edition_count'].values.tolist()))
            fake_ratios_norm.append(
                self.get_ratio(item_frequency_fake_female,
                               item_frequency_fake_male, True))
            # BUGFIX: the raw baseline must use the non-normalized ratio; it
            # previously passed True and just duplicated the normalized one,
            # so mean_fake_ratio was plotted against a mismatched scale.
            fake_ratios.append(
                self.get_ratio(item_frequency_fake_female,
                               item_frequency_fake_male, False))
        item_frequency_female = itemfreq(
            np.array(women['edition_count'].values.tolist()))
        item_frequency_male = itemfreq(
            np.array(men['edition_count'].values.tolist()))
        ratio_norm = self.get_ratio(item_frequency_female,
                                    item_frequency_male, True)
        # average each baseline ratio per edition-count key over permutations
        mean_fake_ratio_norm = {}
        mean_fake_ratio = {}
        for key in ratio_norm.keys():
            vals = []
            vals_norm = []
            for dic1 in fake_ratios_norm:
                if key in dic1:
                    vals_norm.append(dic1.get(key))
            mean_fake_ratio_norm[key] = np.mean(vals_norm)
            for dic2 in fake_ratios:
                if key in dic2:
                    vals.append(dic2.get(key))
            mean_fake_ratio[key] = np.mean(vals)
        # NOTE: if we plot the normalized ratio we should take the log, since
        # the upper bound is 0 but the ratio can become extremely small.
        ratio = self.get_ratio(item_frequency_female, item_frequency_male,
                               False)
        lowess = sm.nonparametric.lowess(ratio.values(), ratio.keys(),
                                         frac=0.1)
        ut.plotratio(
            ratio, lowess, mean_fake_ratio, ['b^', 'g', 'r--'],
            ['empirical gender', 'lowess fit', 'random gender'],
            self.pre + "-" + self.post, path + '/numedition_gender_ratio' +
            self.pre + "-" + self.post + '.png', 'Num Editions',
            'Male/Female', False, False)
        ut.plot_cdf(
            list([item_frequency_female, item_frequency_male]), labels,
            ['pink', 'blue'], path + '/numedition_gender_ccdf' + self.pre +
            "-" + self.post + '.png', 'Num Editions', True, False, True)
    self.logfile.write(
        "\n\n men median(men mean), women median (women mean)")
    self.logfile.write("\n " + str(np.median(men.edition_count.values)) +
                       '(' + str(np.mean(men.edition_count.values)) + '), ' +
                       str(np.median(women.edition_count.values)) + '(' +
                       str(np.mean(women.edition_count.values)) + ')')
    return {
        "class": classname,
        "median-men": np.median(men.edition_count.values),
        "mean-men": np.mean(men.edition_count.values),
        "sem-men": stats.sem(men.edition_count.values),
        "sem-women": stats.sem(women.edition_count.values),
        "median-women": np.median(women.edition_count.values),
        "mean-women": np.mean(women.edition_count.values)
    }
def run(self): regionalEntropy = {} regionCount = {} timeEntropy = {} sumInterest = {} posInterest = {} ############################################################################## # PARSE GOOGLE TREND RESULT FILES ############################################################################## quota_error = 0 for file in self.onlyfiles: startFromLine = -1 startFromLineTime = -1 pos = file.find(".csv") filename = file[0:pos] linesCounter = 1 end = False endTime = False with open(self.datapath+"/"+file) as f: content = f.readlines() regions = {} timeseries = {} for line in content: if line.startswith("<div id="): quota_error += 1 print "quota error for %s"%filename break; if line.startswith("Region,"): startFromLine = linesCounter if line.startswith("Month,"): startFromLineTime = linesCounter if ((startFromLine > 0) and (linesCounter > startFromLine) and (not end)): if line == "\n": end = True else: items = line.split(",") regions[items[0]] = items[1] if ((startFromLineTime > 0) and (linesCounter > startFromLineTime) and (not endTime)): print line if line == "\n": endTime = True else: items = line.split(",") if items[1] == ' \n': # sometimes gtrends returns empty field rather than 0 timeseries[items[0]] = "0" else: timeseries[items[0]] = items[1] linesCounter += 1 timeFrequs = map(int, timeseries.values()) regionFrequs = map(int,(regions.values())) if linesCounter > 1: sumInterest[filename] = np.sum(timeFrequs) posInterest[filename] = np.count_nonzero(timeFrequs) if posInterest[filename] > 0: timeEntropy[filename] = stats.entropy(timeFrequs) else: timeEntropy[filename] = np.nan regionCount[filename] = len(regionFrequs) if(np.sum(regionFrequs) > 0): regionalEntropy[filename] = stats.entropy(regionFrequs) else: regionalEntropy[filename] = np.nan # store results into a dataframe regionalEntropyDF = pd.DataFrame.from_dict(regionalEntropy.items()) regionalEntropyDF.columns=["filename", "entropy"] print regionalEntropyDF.head() regionCountDF = 
pd.DataFrame.from_dict(regionCount.items()) regionCountDF.columns=["filename", "numRegions"] interestDF = pd.DataFrame.from_dict(sumInterest.items()) interestDF.columns=["filename", "timeInterest"] timeEntropyDF = pd.DataFrame.from_dict(timeEntropy.items()) timeEntropyDF.columns=["filename", "timeEntropy"] posInterestDF = pd.DataFrame.from_dict(posInterest.items()) posInterestDF.columns=["filename", "timePosInterest"] #print "regionalEntropyDF" #print regionalEntropyDF.head(n=1) #print regionalEntropyDF.shape #print "regionCountDF" #print regionCountDF.head(n=1) #print regionCountDF.shape #print "self.people" #print self.people.head(n=1) #print self.people.shape # add the computed statistics to the people file self.people = self.people.merge(regionalEntropyDF, right_on="filename", left_on="filename", how="inner") self.people = self.people.merge(regionCountDF, right_on="filename", left_on="filename", how="inner") self.people = self.people.merge(timeEntropyDF, right_on="filename", left_on="filename", how="inner") self.people = self.people.merge(interestDF, right_on="filename", left_on="filename", how="inner") self.people = self.people.merge(posInterestDF, right_on="filename", left_on="filename", how="inner") print "AFTER MERGING" print self.people.head(n=1) print self.people.shape ############################################################################## # PLOTS NUM REGIONS ############################################################################## men = self.people[self.people.gender =="male"] women = self.people[self.people.gender =="female"] labels = ['female ('+str(len(women.index))+')', 'male ('+str(len(men.index))+')'] data = [women.numRegions.values, men.numRegions.values] self.logfile.write("\n Mann Withney U Num regions:") U, p = stats.mstats.mannwhitneyu(women.numRegions.values, men.numRegions.values) ut.write_mannwithneyu(U, p, women.numRegions.values, men.numRegions.values, self.logfile) self.make_boxplot(data, labels, 
self.imgpath+"gtrend_num_regions_box.png", "num regions") self.plot_ccdf(np.array(women.numRegions.values.tolist()), np.array(men.numRegions.values.tolist()), labels, self.imgpath+"gtrend_num_regions_ccdf.png", "Num Regions", 1, 0) ut.plot_facet_dist(self.people, 'gender', 'numRegions', self.imgpath+"gtrend_num_regions.png") ut.rank_size_plot(self.people, 'numRegions', 'Num Regions Gtrends', self.imgpath+"gtrend_num_regions_ranksize.png") ############################################################################## # PLOTS TOTAL INTEREST ############################################################################## data = [women.timeInterest.values, men.timeInterest.values] self.logfile.write("\n \n Mann Withney U Temp Sum Interest:") U, p = stats.mstats.mannwhitneyu(women.timeInterest.values, men.timeInterest.values) ut.write_mannwithneyu(U, p, women.timeInterest.values, men.timeInterest.values, self.logfile) self.make_boxplot(data, labels, self.imgpath+"gtrend_time_interest_box.png", "sum interest") self.plot_ccdf(np.array(women.timeInterest.values.tolist()), np.array(men.timeInterest.values.tolist()), labels, self.imgpath+"gtrend_time_interest_ccdf.png", "Sum Interest", 1, 0) ut.plot_facet_dist(self.people, 'gender', 'timeInterest', self.imgpath+"gtrend_time_interest.png") data = [women.timePosInterest.values, men.timePosInterest.values] self.logfile.write("\n\n Mann Withney U Temp Pos Interest:") U, p = stats.mstats.mannwhitneyu(women.timePosInterest.values, men.timePosInterest.values) ut.write_mannwithneyu(U, p, women.timePosInterest.values, men.timePosInterest.values, self.logfile) self.make_boxplot(data, labels, self.imgpath+"gtrend_time_pos_interest_box.png", "num weeks with interest") self.plot_ccdf(np.array(women.timePosInterest.values.tolist()), np.array(men.timePosInterest.values.tolist()), labels, self.imgpath+"gtrend_time_pos_interest_ccdf.png", "Num weeks with interest", 1, 0) ut.plot_facet_dist(self.people, 'gender', 'timePosInterest', 
self.imgpath+"gtrend_time_pos_interest.png") ############################################################################## # PLOT Entropy Temp INTEREST ############################################################################## limPeople = self.people[np.isfinite(self.people['timeEntropy'])] #people[people.index not in inds] men = limPeople[limPeople.gender =="male"] women = limPeople[limPeople.gender =="female"] data = [women.timeEntropy.values, men.timeEntropy.values] self.logfile.write("\n\n Mann Withney U Time Entropy:") U, p = stats.mstats.mannwhitneyu(women.timeEntropy.values, men.timeEntropy.values) ut.write_mannwithneyu(U, p, women.timeEntropy.values, men.timeEntropy.values, self.logfile) self.make_boxplot(data, labels, self.imgpath+"gtrend_time_entropy_box.png", "temporal entropy") self.plot_ccdf(np.array(women.timeEntropy.values.tolist()), np.array(men.timeEntropy.values.tolist()), labels, self.imgpath+"gtrend_time_entropy_ccdf.png", "Temp Entropy", 1, 0) ut.plot_facet_dist(self.people, 'gender', 'timeEntropy', self.imgpath+"gtrend_time_entropy.png") ############################################################################## # PLOT ENTROPY ############################################################################## # for entropy we need to remove the nan value. 
If we dont have data the result is nan limPeople = self.people[np.isfinite(self.people['entropy'])] #people[people.index not in inds] men = limPeople[limPeople.gender =="male"] women = limPeople[limPeople.gender =="female"] labels = ['female ('+str(len(women.index))+')', 'male ('+str(len(men.index))+')'] data = [women.entropy.values, men.entropy.values] self.logfile.write("\n\n Mann Withney U Entropy:") U, p = stats.mstats.mannwhitneyu(women.entropy.values, men.entropy.values) ut.write_mannwithneyu(U, p, women.entropy.values, men.entropy.values, self.logfile) self.make_boxplot(data, labels, "gtrend_region_entropy_box.png", "shannon entropy") self.plot_ccdf(np.array(women.entropy.values.tolist()), np.array(men.entropy.values.tolist()), labels, self.imgpath+"gtrend_entropy_ccdf.png", "Entropy", 0, 0) ut.plot_facet_dist(self.people, 'gender', 'entropy', self.imgpath+"gtrend_region_entropy.png") self.regression()
def run(self):
    """Parse Google Trends JSON result files and plot gender comparisons.

    Counterpart of the CSV-based ``run``: reads every ``*.json`` file in
    ``self.onlyfiles``, extracts the interest-over-time series via
    ``self.parse_gtrend_json``, computes per-file sum of interest, number
    of positive weeks and temporal Shannon entropy, merges them into
    ``self.people`` and compares genders (Mann-Whitney U, boxplots,
    CCDFs, facet plots under ``self.imgpath``); ends with
    ``self.regression()``.

    Side effects: mutates ``self.people`` (inner merges may drop rows),
    writes to ``self.logfile`` and image files, prints debug output.
    """
    regionalEntropy = {}
    regionCount = {}
    timeEntropy = {}
    sumInterest = {}
    posInterest = {}
    ##############################################################################
    # PARSE GOOGLE TREND RESULT FILES
    ##############################################################################
    quota_error = 0
    # NOTE(review): monthlyFiles/weeklyFiles/weekly are never updated or
    # read below — apparently leftovers from an earlier version.
    monthlyFiles = 0
    weeklyFiles = 0
    for file in self.onlyfiles:
        startFromLine = -1
        startFromLineTime = -1
        weekly = False
        pos = file.find(".json")
        # skip anything that is not a .json result file
        if (pos >= 0):
            filename = file[0:pos]
            linesCounter = 1
            end = False
            endTime = False
            with open(self.datapath+"/"+file) as f:
                content = f.read()
                #print content
                data = self.parse_gtrend_json(content)
                timeseries = {}
                print data
                print len(data)
                # NOTE(review): parse_gtrend_json presumably returns a flat
                # [date, value, date, value, ...] list — TODO confirm.
                if len(data) > 3:
                    for ind in xrange(0, len(data), 2):
                        #print data[ind]
                        #print data[ind+1]
                        # guard against an odd-length list
                        if (ind+1) < len(data):
                            timeseries[data[ind]] = data[ind+1]
                timeFrequs = map(int, timeseries.values())
                if len(timeFrequs) > 1:
                    sumInterest[filename] = np.sum(timeFrequs)
                    posInterest[filename] = np.count_nonzero(timeFrequs)
                    if posInterest[filename] > 0:
                        timeEntropy[filename] = stats.entropy(timeFrequs)
                    else:
                        # entropy undefined when there was no interest at all
                        timeEntropy[filename] = np.nan
    # store results into dataframes keyed by filename
    interestDF = pd.DataFrame.from_dict(sumInterest.items())
    interestDF.columns=["filename", "timeInterest"]
    timeEntropyDF = pd.DataFrame.from_dict(timeEntropy.items())
    timeEntropyDF.columns=["filename", "timeEntropy"]
    posInterestDF = pd.DataFrame.from_dict(posInterest.items())
    posInterestDF.columns=["filename", "timePosInterest"]
    #print "regionalEntropyDF"
    #print regionalEntropyDF.head(n=1)
    #print regionalEntropyDF.shape
    #print "regionCountDF"
    #print regionCountDF.head(n=1)
    #print regionCountDF.shape
    #print "self.people"
    #print self.people.head(n=1)
    #print self.people.shape
    # add the computed statistics to the people file (inner joins on filename)
    self.people = self.people.merge(timeEntropyDF, right_on="filename", left_on="filename", how="inner")
    self.people = self.people.merge(interestDF, right_on="filename", left_on="filename", how="inner")
    self.people = self.people.merge(posInterestDF, right_on="filename", left_on="filename", how="inner")
    print "AFTER MERGING"
    print self.people.head(n=1)
    print self.people.shape
    men = self.people[self.people.gender =="male"]
    women = self.people[self.people.gender =="female"]
    labels = ['female ('+str(len(women.index))+')', 'male ('+str(len(men.index))+')']
    ##############################################################################
    # PLOTS TOTAL INTEREST
    ##############################################################################
    data = [women.timeInterest.values, men.timeInterest.values]
    self.logfile.write("\n \n Mann Withney U Temp Sum Interest:")
    U, p = stats.mstats.mannwhitneyu(women.timeInterest.values, men.timeInterest.values)
    ut.write_mannwithneyu(U, p, women.timeInterest.values, men.timeInterest.values, self.logfile)
    self.make_boxplot(data, labels, self.imgpath+"gtrend_time_interest_box.png", "sum interest")
    self.plot_ccdf(np.array(women.timeInterest.values.tolist()), np.array(men.timeInterest.values.tolist()), labels, self.imgpath+"gtrend_time_interest_ccdf.png", "Sum Interest", 1, 0)
    ut.plot_facet_dist(self.people, 'gender', 'timeInterest', self.imgpath+"gtrend_time_interest.png")
    data = [women.timePosInterest.values, men.timePosInterest.values]
    self.logfile.write("\n\n Mann Withney U Temp Pos Interest:")
    U, p = stats.mstats.mannwhitneyu(women.timePosInterest.values, men.timePosInterest.values)
    ut.write_mannwithneyu(U, p, women.timePosInterest.values, men.timePosInterest.values, self.logfile)
    self.make_boxplot(data, labels, self.imgpath+"gtrend_time_pos_interest_box.png", "num weeks with interest")
    self.plot_ccdf(np.array(women.timePosInterest.values.tolist()), np.array(men.timePosInterest.values.tolist()), labels, self.imgpath+"gtrend_time_pos_interest_ccdf.png", "Num weeks with interest", 1, 0)
    ut.plot_facet_dist(self.people, 'gender', 'timePosInterest', self.imgpath+"gtrend_time_pos_interest.png")
    ##############################################################################
    # PLOT Entropy Temp INTEREST
    ##############################################################################
    # drop NaN entropies (files with no interest) before comparing
    limPeople = self.people[np.isfinite(self.people['timeEntropy'])] #people[people.index not in inds]
    men = limPeople[limPeople.gender =="male"]
    women = limPeople[limPeople.gender =="female"]
    data = [women.timeEntropy.values, men.timeEntropy.values]
    self.logfile.write("\n\n Mann Withney U Time Entropy:")
    U, p = stats.mstats.mannwhitneyu(women.timeEntropy.values, men.timeEntropy.values)
    ut.write_mannwithneyu(U, p, women.timeEntropy.values, men.timeEntropy.values, self.logfile)
    self.make_boxplot(data, labels, self.imgpath+"gtrend_time_entropy_box.png", "temporal entropy")
    self.plot_ccdf(np.array(women.timeEntropy.values.tolist()), np.array(men.timeEntropy.values.tolist()), labels, self.imgpath+"gtrend_time_entropy_ccdf.png", "Temp Entropy", 1, 0)
    # NOTE(review): passes the unfiltered self.people (not limPeople) —
    # presumably the facet plot tolerates NaN; confirm before changing.
    ut.plot_facet_dist(self.people, 'gender', 'timeEntropy', self.imgpath+"gtrend_time_entropy.png")
    self.regression()