示例#1
0
class Results():
    """Run one search query and expose aggregate views of its results.

    The query is executed once, in the constructor.  The accessor methods
    then derive activities, time series and token-frequency tables from
    that single result set, so several analyses can share one API call.
    """

    def __init__(self,
                 user,
                 password,
                 stream_url,
                 paged=False,
                 output_file_path=None,
                 pt_filter=None,
                 max_results=100,
                 start=None,
                 end=None,
                 count_bucket=None,
                 show_query=False,
                 hard_max=None):
        """Build the underlying Query and execute it immediately.

        user, password, stream_url, paged, output_file_path and hard_max
        are passed positionally to Query; the remaining arguments are
        forwarded as keyword arguments to Query.execute.
        """
        self.query = Query(user, password, stream_url, paged, output_file_path,
                           hard_max)
        # __repr__ needs the filter and the count bucket; keep the executed
        # parameters on this object so the report never depends on an
        # attribute this class did not set.
        self.last_query_params = {
            "pt_filter": pt_filter,
            "max_results": max_results,
            "start": start,
            "end": end,
            "count_bucket": count_bucket,
            "show_query": show_query,
        }
        self.query.execute(**self.last_query_params)
        # Frequency table built by the most recent get_top_* call.
        self.freq = None

    @staticmethod
    def _drop_zero_millis(timestamp):
        """Return timestamp without a trailing ".000Z", if it has one.

        str.strip(".000Z") removes any run of those *characters* from both
        ends, which also eats legitimate trailing zeros of the seconds.
        """
        suffix = ".000Z"
        if timestamp.endswith(suffix):
            return timestamp[:-len(suffix)]
        return timestamp

    def get_raw_results(self):
        """Generator over the query's raw results."""
        for x in self.query.get_raw_results():
            yield x

    def get_activities(self):
        """Generator over the query's activity set."""
        for x in self.query.get_activity_set():
            yield x

    def get_time_series(self):
        """Generator over the query's time series."""
        for x in self.query.get_time_series():
            yield x

    def get_top_links(self, n=20):
        """Count every entry of each activity's most_unrolled_urls and
           return the first n tokens of the resulting frequency table."""
        self.freq = SimpleNGrams(char_upper_cutoff=100, tokenizer="space")
        for x in self.query.get_activity_set():
            for link_str in x.most_unrolled_urls:
                self.freq.add(link_str)
        return self.freq.get_tokens(n)

    def get_top_users(self, n=50):
        """Count each activity's screen_name and return the first n tokens
           of the resulting frequency table."""
        self.freq = SimpleNGrams(char_upper_cutoff=20, tokenizer="twitter")
        for x in self.query.get_activity_set():
            self.freq.add(x.screen_name)
        return self.freq.get_tokens(n)

    def get_users(self, n=None):
        """Return the set of user ids seen in the activity set.

        n is accepted for interface compatibility and is not used.
        """
        return set(x.user_id for x in self.query.get_activity_set())

    def get_top_grams(self, n=20):
        """Count the tokens of each activity's all_text and return the
           first n tokens of the resulting frequency table."""
        self.freq = SimpleNGrams(char_upper_cutoff=20, tokenizer="twitter")
        # URL fragments would otherwise dominate the term counts.
        self.freq.sl.add_session_stop_list(["http", "https", "amp", "htt"])
        for x in self.query.get_activity_set():
            self.freq.add(x.all_text)
        return self.freq.get_tokens(n)

    def get_geo(self):
        """Generator of {"id", "postedTime", "latitude", "longitude"} dicts
           for every activity whose geo_coordinates is not None."""
        for x in self.query.get_activity_set():
            lat_lon = x.geo_coordinates
            if lat_lon is None:
                continue
            yield {
                "id": x.id,
                "postedTime": self._drop_zero_millis(x.created_at_string),
                "latitude": lat_lon["latitude"],
                "longitude": lat_lon["longitude"]
            }

    def get_frequency_items(self, size=20):
        """Return the first size tokens of the last frequency table built.

        Raises ValueError when no get_top_* method has been called yet.
        """
        if self.freq is None:
            raise ValueError("No frequency available for use case")
        return self.freq.get_tokens(size)

    def __len__(self):
        """Delegate to len() of the underlying query."""
        return len(self.query)

    def _frequency_section(self, title, count_label, width, row_fmt):
        """Render the first 20 tokens of self.freq as header, rule, rows, rule.

        width is the first column's width; row_fmt is a %-format template
        whose "{}" placeholder receives that width.
        """
        rule = "-" * OUTPUT_PAGE_WIDTH
        header_fmt = u"%{}s -- %10s     %8s (%d)".format(width)
        lines = [
            header_fmt % (title, count_label, "activities",
                          self.query.res_cnt),
            rule,
        ]
        fmt_str = row_fmt.format(width)
        # Token rows are indexed positionally; index 4 is printed as the
        # label column (presumably the token text -- confirm in SimpleNGrams).
        for x in self.freq.get_tokens(20):
            lines.append(fmt_str %
                         (x[4], x[0], x[1] * 100., x[2], x[3] * 100.))
        lines.append(rule)
        return lines

    def __repr__(self):
        """Return a text report of the query.

        With a count bucket, one "timestamp,count" line per time-series
        entry; otherwise a summary header followed by the top users, links
        and terms tables.
        """
        if self.last_query_params["count_bucket"] is not None:
            res = [
                "{:%Y-%m-%dT%H:%M:%S},{}".format(x[2], x[1])
                for x in self.get_time_series()
            ]
            return u"\n".join(res)

        rate = self.query.get_rate()
        unit = "Tweets/Minute"
        if rate < 0.01:
            # Very low rates read better per hour.
            rate *= 60.
            unit = "Tweets/Hour"
        res = [u"-" * OUTPUT_PAGE_WIDTH]
        res.append("     PowerTrack Rule: \"%s\"" %
                   self.last_query_params["pt_filter"])
        res.append("  Oldest Tweet (UTC): %s" % str(self.query.oldest_t))
        res.append("  Newest Tweet (UTC): %s" % str(self.query.newest_t))
        res.append(
            "           Now (UTC): %s" %
            str(datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")))
        res.append("        %5d Tweets: %6.3f %s" %
                   (self.query.res_cnt, rate, unit))
        res.append("-" * OUTPUT_PAGE_WIDTH)

        # Each get_top_* call rebuilds self.freq, which the section
        # renderer then reads.
        self.get_top_users()
        res.extend(self._frequency_section(
            "users", "tweets", BIG_COLUMN_WIDTH,
            u"%{}s -- %4d  %5.2f%% %4d  %5.2f%%"))
        self.get_top_links()
        res.extend(self._frequency_section(
            "links", "mentions", int(2.5 * BIG_COLUMN_WIDTH),
            u"%{}s -- %4d  %5.2f%% %4d  %5.2f%%"))
        self.get_top_grams()
        res.extend(self._frequency_section(
            "terms", "mentions", BIG_COLUMN_WIDTH,
            u"%{}s -- %4d  %5.2f%% %4d  %6.2f%%"))
        return u"\n".join(res)
class Results():
    """Run one search query and expose aggregate views of its results.

    The query is executed once, in the constructor.  The accessor methods
    then derive activities, time series and token-frequency tables from
    that single result set, so several analyses can share one API call.
    """

    def __init__(self,
                 user,
                 password,
                 stream_url,
                 paged=False,
                 output_file_path=None,
                 pt_filter=None,
                 max_results=100,
                 start=None,
                 end=None,
                 count_bucket=None,
                 show_query=False,
                 hard_max=None):
        """Build the underlying Query and execute it immediately.

        user, password, stream_url, paged, output_file_path and hard_max
        are passed positionally to Query; the remaining arguments are
        forwarded as keyword arguments to Query.execute.
        """
        self.query = Query(user, password, stream_url, paged,
                           output_file_path, hard_max)
        # __repr__ needs the filter and the count bucket; keep the executed
        # parameters on this object so the report never depends on an
        # attribute this class did not set.
        self.last_query_params = {
            "pt_filter": pt_filter,
            "max_results": max_results,
            "start": start,
            "end": end,
            "count_bucket": count_bucket,
            "show_query": show_query,
        }
        self.query.execute(**self.last_query_params)
        # Frequency table built by the most recent get_top_* call.
        self.freq = None

    @staticmethod
    def _drop_zero_millis(timestamp):
        """Return timestamp without a trailing ".000Z", if it has one.

        str.strip(".000Z") removes any run of those *characters* from both
        ends, which also eats legitimate trailing zeros of the seconds.
        """
        suffix = ".000Z"
        if timestamp.endswith(suffix):
            return timestamp[:-len(suffix)]
        return timestamp

    def get_activities(self):
        """Generator over the query's activity set."""
        for x in self.query.get_activity_set():
            yield x

    def get_time_series(self):
        """Generator over the query's time series."""
        for x in self.query.get_time_series():
            yield x

    def get_top_links(self, n=20):
        """Count the links field of every list-set row and return the first
           n tokens of the resulting frequency table.

        Rows whose links field is "GNIPEMPTYFIELD" or "None" are counted
        under the placeholder token "NoLinks".
        """
        self.freq = SimpleNGrams(char_upper_cutoff=100, tokenizer="space")
        for x in self.query.get_list_set():
            link_str = x[LINKS_INDEX]
            if link_str in ("GNIPEMPTYFIELD", "None"):
                self.freq.add("NoLinks")
            else:
                self.freq.add(link_str)
        return self.freq.get_tokens(n)

    def get_top_users(self, n=50):
        """Count the user-name field of every list-set row and return the
           first n tokens of the resulting frequency table."""
        self.freq = SimpleNGrams(char_upper_cutoff=20, tokenizer="twitter")
        for x in self.query.get_list_set():
            self.freq.add(x[USER_NAME_INDEX])
        return self.freq.get_tokens(n)

    def get_users(self, n=None):
        """Return the set of user ids found in the list set.

        n is accepted for interface compatibility and is not used.
        """
        return set(x[USER_ID_INDEX] for x in self.query.get_list_set())

    def get_top_grams(self, n=20):
        """Count the tokens of every row's text field and return the first
           n tokens of the resulting frequency table."""
        self.freq = SimpleNGrams(char_upper_cutoff=20, tokenizer="twitter")
        # URL fragments would otherwise dominate the term counts.
        self.freq.sl.add_session_stop_list(["http", "https", "amp", "htt"])
        for x in self.query.get_list_set():
            self.freq.add(x[TEXT_INDEX])
        return self.freq.get_tokens(n)

    def get_geo(self):
        """Generator of {"id", "postedTime", "latitude", "longitude"} dicts
           for every activity record carrying rec["geo"]["coordinates"].

        The id is the third ":"-separated field of rec["id"].
        """
        for rec in self.query.get_activity_set():
            if "geo" in rec and "coordinates" in rec["geo"]:
                lat, lng = rec["geo"]["coordinates"]
                yield {
                    "id": rec["id"].split(":")[2],
                    "postedTime": self._drop_zero_millis(rec["postedTime"]),
                    "latitude": lat,
                    "longitude": lng
                }

    def get_frequency_items(self, size=20):
        """Return the first size tokens of the last frequency table built.

        Raises ValueError when no get_top_* method has been called yet.
        """
        if self.freq is None:
            raise ValueError("No frequency available for use case")
        return self.freq.get_tokens(size)

    def __len__(self):
        """Delegate to len() of the underlying query."""
        return len(self.query)

    def _frequency_section(self, title, count_label, width, row_fmt):
        """Render the first 20 tokens of self.freq as header, rule, rows, rule.

        width is the first column's width; row_fmt is a %-format template
        whose "{}" placeholder receives that width.
        """
        rule = "-" * OUTPUT_PAGE_WIDTH
        header_fmt = u"%{}s -- %10s     %8s (%d)".format(width)
        lines = [
            header_fmt % (title, count_label, "activities",
                          self.query.res_cnt),
            rule,
        ]
        fmt_str = row_fmt.format(width)
        # Token rows are indexed positionally; index 4 is printed as the
        # label column (presumably the token text -- confirm in SimpleNGrams).
        for x in self.freq.get_tokens(20):
            lines.append(fmt_str %
                         (x[4], x[0], x[1] * 100., x[2], x[3] * 100.))
        lines.append(rule)
        return lines

    def __repr__(self):
        """Return a text report of the query.

        With a count bucket, one "timestamp,count" line per time-series
        entry; otherwise a summary header followed by the top users, links
        and terms tables.
        """
        if self.last_query_params["count_bucket"] is not None:
            res = ["{:%Y-%m-%dT%H:%M:%S},{}".format(x[2], x[1])
                   for x in self.get_time_series()]
            return u"\n".join(res)

        rate = self.query.get_rate()
        unit = "Tweets/Minute"
        if rate < 0.01:
            # Very low rates read better per hour.
            rate *= 60.
            unit = "Tweets/Hour"
        res = [u"-" * OUTPUT_PAGE_WIDTH]
        res.append("     PowerTrack Rule: \"%s\"" %
                   self.last_query_params["pt_filter"])
        res.append("  Oldest Tweet (UTC): %s" % str(self.query.oldest_t))
        res.append("  Newest Tweet (UTC): %s" % str(self.query.newest_t))
        res.append(
            "           Now (UTC): %s" %
            str(datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")))
        res.append("        %5d Tweets: %6.3f %s" %
                   (self.query.res_cnt, rate, unit))
        res.append("-" * OUTPUT_PAGE_WIDTH)

        # Each get_top_* call rebuilds self.freq, which the section
        # renderer then reads.
        self.get_top_users()
        res.extend(self._frequency_section(
            "users", "tweets", BIG_COLUMN_WIDTH,
            u"%{}s -- %4d  %5.2f%% %4d  %5.2f%%"))
        self.get_top_links()
        res.extend(self._frequency_section(
            "links", "mentions", int(2.5 * BIG_COLUMN_WIDTH),
            u"%{}s -- %4d  %5.2f%% %4d  %5.2f%%"))
        self.get_top_grams()
        res.extend(self._frequency_section(
            "terms", "mentions", BIG_COLUMN_WIDTH,
            u"%{}s -- %4d  %5.2f%% %4d  %6.2f%%"))
        return u"\n".join(res)
示例#3
0
    grams_parser.add_argument("-c", "--char-limit", dest="char_limit", default=2, 
            help="The shortest grams to include in the count.")
    grams_parser.add_argument("-p", "--pretty-print", dest="pretty_print", action="store_true", default=False,
            help="Prettier output format")
    grams_parser.add_argument("-k", "--n-grams", dest="n_grams", default=2,
            help="N-gram depth (default 2)")
    grams_parser.add_argument("-f", "--filter", dest="filter", default=None,
            help="List of terms to filter \"the,and,happy\"")
    opts = grams_parser.parse_args()

    f = SimpleNGrams(charCutoff=int(opts.char_limit), n_grams=opts.n_grams)
    if opts.filter is not None:
        tmp = [x.lower().strip() for x in opts.filter.split(",")]
        f.sl.add_session_stop_list(tmp)
    for row in fileinput.FileInput(opts.file_name,openhook=fileinput.hook_encoded("utf-8")):
        f.add(row)
    if opts.number_of_grams is None:
        res = f.get_repr(opts.number_of_grams)
    else:
        res = f.get_repr(int(opts.number_of_grams))
    if opts.pretty_print:
        fmt = ["%5s", "%9s", "%5s", "%9s", "%24s", "%7s"] 
        for x in res.split('\n'):
            tmp_str = x.strip().split(",")
            sys.stdout.write(" ".join([j%i for i,j in zip(tmp_str,fmt)]) + "\n")
    else:
        sys.stdout.write(res)
    f.term_dictionary("./term_dict.pickle", co=int(opts.char_limit))
    # recover with e.g. pickle.load(open("./term_dict.pickle", "rb"))

示例#4
0
class Results():
    """Run one search query and expose aggregate views of its results.

    The query is executed once, in the constructor.  The accessor methods
    then derive activities, time series and token-frequency tables from
    that single result set, so several analyses can share one API call.
    """

    def __init__(self,
                 user,
                 password,
                 stream_url,
                 paged=False,
                 output_file_path=None,
                 pt_filter=None,
                 max_results=100,
                 start=None,
                 end=None,
                 count_bucket=None,
                 show_query=False,
                 search_v2=False):
        """Build the underlying Query and execute it immediately.

        user, password, stream_url, paged, output_file_path and search_v2
        are passed positionally to Query; the remaining arguments are
        forwarded as keyword arguments to Query.execute.
        """
        self.query = Query(user, password, stream_url, paged, output_file_path,
                           search_v2)
        # __repr__ needs the filter and the count bucket; keep the executed
        # parameters on this object so the report never depends on an
        # attribute this class did not set.
        self.last_query_params = {
            "pt_filter": pt_filter,
            "max_results": max_results,
            "start": start,
            "end": end,
            "count_bucket": count_bucket,
            "show_query": show_query,
        }
        self.query.execute(**self.last_query_params)
        # Frequency table built by the most recent get_top_* call.
        self.freq = None

    @staticmethod
    def _drop_zero_millis(timestamp):
        """Return timestamp without a trailing ".000Z", if it has one.

        str.strip(".000Z") removes any run of those *characters* from both
        ends, which also eats legitimate trailing zeros of the seconds.
        """
        suffix = ".000Z"
        if timestamp.endswith(suffix):
            return timestamp[:-len(suffix)]
        return timestamp

    def get_activities(self):
        """Generator over the query's activity set."""
        for x in self.query.get_activity_set():
            yield x

    def get_time_series(self):
        """Generator over the query's time_series attribute.

        Per the original notes: with a valid count_bucket the values come
        from the counts endpoint; for the data endpoint they are the
        createdDate of the retrieved activities (behaviour of Query, not
        verifiable from this class).
        """
        for x in self.query.time_series:
            yield x

    def get_top_links(self, n=20):
        """Count every link of every list-set row and return the first n
           tokens of the resulting frequency table.

        The links field is the text form of a Python list literal; rows
        whose field is "GNIPEMPTYFIELD" or "None" are counted under the
        placeholder token "NoLinks".
        """
        # Local import keeps this class self-contained.
        import ast
        self.freq = SimpleNGrams(char_upper_cutoff=100, tokenizer="space")
        for x in self.query.get_list_set():
            link_str = x[LINKS_INDEX]
            if link_str in ("GNIPEMPTYFIELD", "None"):
                self.freq.add("NoLinks")
                continue
            # literal_eval only accepts literals, so API-supplied text can
            # no longer run arbitrary code as it could through exec(); it
            # also works on Python 3, where exec() cannot bind a local.
            for link in ast.literal_eval(link_str):
                self.freq.add(link)
        return self.freq.get_tokens(n)

    def get_top_users(self, n=50):
        """Count the user-name field of every list-set row and return the
           first n tokens of the resulting frequency table."""
        self.freq = SimpleNGrams(char_upper_cutoff=20, tokenizer="twitter")
        for x in self.query.get_list_set():
            self.freq.add(x[USER_NAME_INDEX])
        return self.freq.get_tokens(n)

    def get_top_grams(self, n=20):
        """Count the tokens of every row's text field and return the first
           n tokens of the resulting frequency table."""
        self.freq = SimpleNGrams(char_upper_cutoff=20, tokenizer="twitter")
        for x in self.query.get_list_set():
            self.freq.add(x[TEXT_INDEX])
        return self.freq.get_tokens(n)

    def get_geo(self):
        """Generator of {"id", "postedTime", "latitude", "longitude"} dicts
           for every activity record carrying rec["geo"]["coordinates"].

        The id is the third ":"-separated field of rec["id"].
        """
        for rec in self.query.get_activity_set():
            if "geo" in rec and "coordinates" in rec["geo"]:
                lat, lng = rec["geo"]["coordinates"]
                yield {
                    "id": rec["id"].split(":")[2],
                    "postedTime": self._drop_zero_millis(rec["postedTime"]),
                    "latitude": lat,
                    "longitude": lng
                }

    def get_frequency_items(self, size=20):
        """Return the first size tokens of the last frequency table built.

        Raises ValueError when no get_top_* method has been called yet.
        """
        if self.freq is None:
            raise ValueError("No frequency available for use case")
        return self.freq.get_tokens(size)

    def __len__(self):
        """Delegate to len() of the underlying query."""
        return len(self.query)

    def _frequency_section(self, title, count_label, width, row_fmt):
        """Render the first 20 tokens of self.freq as header, rule, rows, rule.

        width is the first column's width; row_fmt is a %-format template
        whose "{}" placeholder receives that width.
        """
        rule = "-" * OUTPUT_PAGE_WIDTH
        header_fmt = u"%{}s -- %10s     %8s (%d)".format(width)
        lines = [
            header_fmt % (title, count_label, "activities",
                          self.query.res_cnt),
            rule,
        ]
        fmt_str = row_fmt.format(width)
        # Token rows are indexed positionally; index 4 is printed as the
        # label column (presumably the token text -- confirm in SimpleNGrams).
        for x in self.freq.get_tokens(20):
            lines.append(fmt_str %
                         (x[4], x[0], x[1] * 100., x[2], x[3] * 100.))
        lines.append(rule)
        return lines

    def __repr__(self):
        """Return a text report of the query.

        With a count bucket, one "timestamp,count" line per time-series
        entry; otherwise a summary header followed by the top users, links
        and terms tables.
        """
        if self.last_query_params["count_bucket"] is not None:
            res = [
                "{:%Y-%m-%dT%H:%M:%S},{}".format(x[2], x[1])
                for x in self.get_time_series()
            ]
            return u"\n".join(res)

        rate = self.query.get_rate()
        unit = "Tweets/Minute"
        if rate < 0.01:
            # Very low rates read better per hour.
            rate *= 60.
            unit = "Tweets/Hour"
        res = [u"-" * OUTPUT_PAGE_WIDTH]
        res.append("     PowerTrack Rule: \"%s\"" %
                   self.last_query_params["pt_filter"])
        res.append("  Oldest Tweet (UTC): %s" % str(self.query.oldest_t))
        res.append("  Newest Tweet (UTC): %s" % str(self.query.newest_t))
        res.append(
            "           Now (UTC): %s" %
            str(datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")))
        res.append("        %5d Tweets: %6.3f %s" %
                   (self.query.res_cnt, rate, unit))
        res.append("-" * OUTPUT_PAGE_WIDTH)

        # Each get_top_* call rebuilds self.freq, which the section
        # renderer then reads.
        self.get_top_users()
        res.extend(self._frequency_section(
            "users", "tweets", BIG_COLUMN_WIDTH,
            u"%{}s -- %4d  %5.2f%% %4d  %5.2f%%"))
        self.get_top_links()
        res.extend(self._frequency_section(
            "links", "mentions", int(2.5 * BIG_COLUMN_WIDTH),
            u"%{}s -- %4d  %5.2f%% %4d  %5.2f%%"))
        self.get_top_grams()
        res.extend(self._frequency_section(
            "terms", "mentions", BIG_COLUMN_WIDTH,
            u"%{}s -- %4d  %5.2f%% %4d  %6.2f%%"))
        return u"\n".join(res)