Exemplo n.º 1
0
    def test_update_fdist(self):
        """An empty update leaves the distribution unchanged; a word set
        from a different location should make it grow."""
        words = utils.tokenize_and_filter(self.sr)
        dist = utils.get_freq_dist(words)
        # Updating with no words must be a no-op.
        dist2 = update_fdist(dist, [])
        self.assertEqual(dist, dist2)

        time.sleep(5)  # brief pause so we don't hammer the search API
        # Re-point the search at NYC so a disjoint vocabulary comes back.
        self.g.latitude = 40.734073
        self.g.longitude = -73.990663
        self.g.count = 100
        self.sr = self.g.search()
        words = utils.tokenize_and_filter(self.sr)
        size_before = len(dist)
        dist = update_fdist(dist, words)
        # New words from a new location should enlarge the distribution.
        self.assertTrue(len(dist) > size_before)
Exemplo n.º 2
0
    def test_update_fdist(self):
        """update_fdist: no-op on empty input, grows on fresh input."""
        fdist = utils.get_freq_dist(utils.tokenize_and_filter(self.sr))
        empty_update = fdist
        empty_update = update_fdist(fdist, [])
        # Nothing was added, so both distributions compare equal.
        self.assertEqual(fdist, empty_update)

        time.sleep(5)  # rate-limit courtesy pause between searches
        self.g.latitude = 40.734073    # New York City
        self.g.longitude = -73.990663
        self.g.count = 100
        self.sr = self.g.search()
        fresh_words = utils.tokenize_and_filter(self.sr)
        previous_size = len(fdist)
        fdist = update_fdist(fdist, fresh_words)
        # An entirely new word set should lengthen the distribution.
        self.assertTrue(len(fdist) > previous_size)
Exemplo n.º 3
0
def updating_plot(geosearchclass, number_of_words, grow=True):
    search_results = geosearchclass.search()
    filtered_words = utils.tokenize_and_filter(search_results)
    fdist = utils.get_freq_dist(filtered_words)
    # set up plot
    samples = [item for item, _ in fdist.most_common(number_of_words)]
    freqs = [fdist[sample] for sample in samples]
    plt.grid(True, color="silver")
    plt.plot(freqs, range(len(freqs)))
    plt.yticks(range(len(samples)), [s for s in samples])
    plt.ylabel("Samples")
    plt.xlabel("Counts")
    plt.title("Top Words Frequency Distribution")
    plt.ion()
    plt.show()

    # set up loop
    old_ids = set([s.id for s in search_results])
    for i in xrange(100):
        plt.pause(5)
        # use mixed above, change to recent here
        geosearchclass.result_type = "recent"
        # perturbation study
        # if i%2:  # for testing purposes
        #     # #change location every odd time to nyc
        #     # geosearchclass.latitude =40.734073
        #     # geosearchclass.longitude =-73.990663
        #     # perturb latitude
        #     geosearchclass.latitude =geosearchclass.latitude + .001

        # else:
        #     #now back to sf
        #     # geosearchclass.latitude = 37.7821
        #     # geosearchclass.longitude =  -122.4093
        #     geosearchclass.longitude =geosearchclass.longitude + .001

        search_results = geosearchclass.search()
        new_search_results = utils.new_tweets(search_results, old_ids)
        if new_search_results:
            filtered_words = utils.tokenize_and_filter(new_search_results)
            fdist = update_fdist(fdist, filtered_words)
            if grow:
                newsamples = [
                    item for item, _ in fdist.most_common(number_of_words)
                ]
                s1 = set(newsamples)
                s2 = set(samples)
                s1.difference_update(s2)
                if s1:
                    print "New words: " + str(list(s1))
                    newsamples = list(s1)
                    samples.extend(newsamples)
                    plt.yticks(range(len(samples)), [s for s in samples])
            freqs = [fdist[sample] for sample in samples]
            plt.plot(freqs, range(len(freqs)))
            if grow:
                plt.draw()
            print '%d new tweet(s)' % len(new_search_results)
            old_ids.update(set([s.id for s in new_search_results]))
        else:
            print "no updates"
Exemplo n.º 4
0
def updating_stream_plot(q, number_of_words=30):
    """This plot uses the streaming API to get real time twitter
    information from a given region, determined by a geo-coordinate
    bounding box. The upper left and lower right determine the
    bounding box.

    q is a queue instance, which holds tweets

    number_of_words determines the average number of words in the
    plot. Once the plot reaches 2 x number_of_words, it is shrunk down
    to the new set of words and starts growing again

    To exit the program early, hit CTRL + Z to stop the python script
    and then CTRL + D twice to kill the terminal process and close the
    window.

    """
    setup = False
    fdist = None
    samples = None
    draw_time = 0.1
    samples = []
    plt.ion()
    plt.grid(True, color="silver")

    for i in range(100000):
        status = q.get()
        search_results = [status]
        while not q.empty():
            print "getting another tweet"
            status = q.get()
            search_results.append(status)

        if not setup:
            print "Gathering enough data to begin plotting"
            while len(samples) < 1:
                status = q.get()
                search_results.append(status)
                filtered_words = utils.tokenize_and_filter(search_results)
                if fdist is None:
                    fdist = utils.get_freq_dist(filtered_words)
                else:
                    fdist = update_fdist(fdist, filtered_words)
                n_words = min(10, len(fdist))
                samples = [item for item, _ in fdist.most_common(n_words)]
                # print "len(samples) = {}".format(len(samples))
                samples = remove_infrequent_words(samples, fdist)
            freqs = [fdist[sample] for sample in samples]
            plt.plot(freqs, range(len(freqs)))
            plt.yticks(range(len(samples)), [s for s in samples])
            plt.ylabel("Samples")
            plt.xlabel("Counts")
            plt.title("Top Words Frequency Distribution")
            plt.show()
            plt.pause(draw_time)
            setup = True

        else:
            filtered_words = utils.tokenize_and_filter(search_results)
            fdist = update_fdist(fdist, filtered_words)
            newsamples = [
                item for item, _ in fdist.most_common(number_of_words)
            ]
            newsamples = remove_infrequent_words(newsamples, fdist)
            s1 = set(newsamples)
            s2 = set(samples)
            s1.difference_update(s2)
            if s1:
                print "New words: " + str(list(s1))
                newsamples = list(s1)
                samples.extend(newsamples)
                if len(samples) > 2 * number_of_words:
                    samples = newsamples
                    plt.close()
                plt.yticks(range(len(samples)), [s for s in samples])
            freqs = [fdist[sample] for sample in samples]
            plt.plot(freqs, range(len(freqs)))
            plt.draw()
            plt.pause(draw_time)
    kill_plot()
    return
Exemplo n.º 5
0
def updating_plot(geosearchclass, number_of_words, grow=True):
    search_results = geosearchclass.search()
    filtered_words = utils.tokenize_and_filter(search_results)
    fdist = utils.get_freq_dist(filtered_words)
    # set up plot
    samples = [item for item, _ in fdist.most_common(number_of_words)]
    freqs = [fdist[sample] for sample in samples]
    plt.grid(True, color="silver")
    plt.plot(freqs, range(len(freqs)))
    plt.yticks(range(len(samples)), [s for s in samples])
    plt.ylabel("Samples")
    plt.xlabel("Counts")
    plt.title("Top Words Frequency Distribution")
    plt.ion()
    plt.show()

    # set up loop
    old_ids = set([s.id for s in search_results])
    for i in xrange(100):
        plt.pause(5)
        # use mixed above, change to recent here
        geosearchclass.result_type = "recent"
        # perturbation study
        # if i%2:  # for testing purposes
        #     # #change location every odd time to nyc
        #     # geosearchclass.latitude =40.734073
        #     # geosearchclass.longitude =-73.990663
        #     # perturb latitude
        #     geosearchclass.latitude =geosearchclass.latitude + .001

        # else:
        #     #now back to sf
        #     # geosearchclass.latitude = 37.7821
        #     # geosearchclass.longitude =  -122.4093
        #     geosearchclass.longitude =geosearchclass.longitude + .001

        search_results = geosearchclass.search()
        new_search_results = utils.new_tweets(search_results, old_ids)
        if new_search_results:
            filtered_words = utils.tokenize_and_filter(new_search_results)
            fdist = update_fdist(fdist, filtered_words)
            if grow:
                newsamples = [item
                              for item, _ in fdist.most_common(number_of_words)
                              ]
                s1 = set(newsamples)
                s2 = set(samples)
                s1.difference_update(s2)
                if s1:
                    print "New words: " + str(list(s1))
                    newsamples = list(s1)
                    samples.extend(newsamples)
                    plt.yticks(range(len(samples)), [s for s in samples])
            freqs = [fdist[sample] for sample in samples]
            plt.plot(freqs, range(len(freqs)))
            if grow:
                plt.draw()
            print '%d new tweet(s)' % len(new_search_results)
            old_ids.update(set([s.id for s in new_search_results]))
        else:
            print "no updates"
Exemplo n.º 6
0
def updating_stream_plot(q, number_of_words=30):
    """This plot uses the streaming API to get real time twitter
    information from a given region, determined by a geo-coordinate
    bounding box. The upper left and lower right determine the
    bounding box.

    q is a queue instance, which holds tweets

    number_of_words determines the average number of words in the
    plot. Once the plot reaches 2 x number_of_words, it is shrunk down
    to the new set of words and starts growing again

    To exit the program early, hit CTRL + Z to stop the python script
    and then CTRL + D twice to kill the terminal process and close the
    window.

    """
    setup = False
    fdist = None
    samples = None
    draw_time = 0.1
    samples = []
    plt.ion()
    plt.grid(True, color="silver")

    for i in range(100000):
        status = q.get()
        search_results = [status]
        while not q.empty():
            print "getting another tweet"
            status = q.get()
            search_results.append(status)

        if not setup:
            print "Gathering enough data to begin plotting"
            while len(samples) < 1:
                status = q.get()
                search_results.append(status)
                filtered_words = utils.tokenize_and_filter(search_results)
                if fdist is None:
                    fdist = utils.get_freq_dist(filtered_words)
                else:
                    fdist = update_fdist(fdist, filtered_words)
                n_words = min(10, len(fdist))
                samples = [item for item, _ in fdist.most_common(n_words)]
                # print "len(samples) = {}".format(len(samples))
                samples = remove_infrequent_words(samples, fdist)
            freqs = [fdist[sample] for sample in samples]
            plt.plot(freqs, range(len(freqs)))
            plt.yticks(range(len(samples)), [s for s in samples])
            plt.ylabel("Samples")
            plt.xlabel("Counts")
            plt.title("Top Words Frequency Distribution")
            plt.show()
            plt.pause(draw_time)
            setup = True

        else:
            filtered_words = utils.tokenize_and_filter(search_results)
            fdist = update_fdist(fdist, filtered_words)
            newsamples = [item
                          for item, _ in fdist.most_common(number_of_words)]
            newsamples = remove_infrequent_words(newsamples, fdist)
            s1 = set(newsamples)
            s2 = set(samples)
            s1.difference_update(s2)
            if s1:
                print "New words: " + str(list(s1))
                newsamples = list(s1)
                samples.extend(newsamples)
                if len(samples) > 2*number_of_words:
                    samples = newsamples
                    plt.close()
                plt.yticks(range(len(samples)), [s for s in samples])
            freqs = [fdist[sample] for sample in samples]
            plt.plot(freqs, range(len(freqs)))
            plt.draw()
            plt.pause(draw_time)
    kill_plot()
    return