Example #1
def setup_analysis(do_conversation=False,
                   do_audience=False,
                   identifier=None,
                   input_results=None):
    """
    Create placeholders for quantities of interest in the results structure;
    return the results data structure.

    If an identifier is specified, place the measurement accumulators at a
    particular key.

    """
    def weight_and_screennames():
        return {"weight": 0, "screennames": set([])}

    results = {
        "tweet_count": 0,
        "non-tweet_lines": 0,
        "tweets_per_user": defaultdict(int),
        #"user_id_to_screenname":
    }
    if do_conversation:
        results["do_conversation"] = True
        results["body_term_count"] = SimpleNGrams(char_lower_cutoff=3,
                                                  n_grams=3,
                                                  tokenizer="twitter")
        results["hashtags"] = defaultdict(int)
        results["urls"] = defaultdict(int)
        results["number_of_links"] = 0
        results["utc_timeline"] = defaultdict(int)
        results["local_timeline"] = defaultdict(int)
        results["at_mentions"] = defaultdict(weight_and_screennames)
        results["in_reply_to"] = defaultdict(int)
        results["RT_of_user"] = defaultdict(weight_and_screennames)
        results["quote_of_user"] = defaultdict(weight_and_screennames)
        results["url_content"] = SimpleNGrams(char_lower_cutoff=3,
                                              n_grams=3,
                                              tokenizer="twitter")
    else:
        results["do_conversation"] = False
    if do_audience:
        results["do_audience"] = True
        results["bio_term_count"] = SimpleNGrams(char_lower_cutoff=3,
                                                 n_grams=1,
                                                 tokenizer="twitter")
        results["profile_locations_regions"] = defaultdict(int)
        results["audience_api"] = ""
    else:
        results["do_audience"] = False

    # In the future, custom fields could be added via kwarg=func, where kwarg is the field name and func is the aggregator/extractor.

    return results
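A minimal usage sketch of setup_analysis above, assuming only the collections.defaultdict import the snippet relies on; both flags are left off, so the SimpleNGrams branches are never reached:

from collections import defaultdict

# Build the bare results structure: no conversation or audience accumulators.
results = setup_analysis()
results["tweet_count"] += 1
results["tweets_per_user"]["some_user"] += 1

print(results["do_conversation"], results["do_audience"])  # False False
print(results["tweets_per_user"]["some_user"])              # 1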
Example #2
    def set_index(self, use_case, count_bucket):
        """Configure the frequency counter and output index for the given use case."""
        self.use_case = use_case
        space_tokenizer = False
        char_upper_cutoff = 20  # longer than for normal words because of user names
        if use_case.startswith("links"):
            char_upper_cutoff = 100
            space_tokenizer = True
#         self.freq = SimpleNGrams(charUpperCutoff=char_upper_cutoff, space_tokenizer=space_tokenizer)
        self.freq = SimpleNGrams(char_upper_cutoff=char_upper_cutoff,
                                 tokenizer="space")
        if use_case.startswith("user"):
            self.index = USER_NAME_INDEX
        elif use_case.startswith("wordc"):
            self.index = TEXT_INDEX
        elif use_case.startswith("rate"):
            self.index = DATE_INDEX
        elif use_case.startswith("link"):
            self.index = LINKS_INDEX
        elif use_case.startswith("time"):
            if not self.stream_url.endswith("counts.json"):
                self.stream_url = self.stream_url[:-5] + "/counts.json"
            if count_bucket not in ['day', 'minute', 'hour']:
                print >> sys.stderr, "Error. Invalid count bucket: %s \n" % str(
                    count_bucket)
                sys.exit()
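When the use case starts with "time", set_index switches the endpoint to its counts variant by slicing off the trailing ".json" and appending "/counts.json". A standalone illustration of that string manipulation, using a placeholder URL:

stream_url = "https://search.gnip.com/accounts/example/search/dev.json"
if not stream_url.endswith("counts.json"):
    # drop the 5-character ".json" suffix, then point at the counts endpoint
    stream_url = stream_url[:-5] + "/counts.json"
# stream_url is now ".../accounts/example/search/dev/counts.json"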
Example #3
 def get_top_users(self, n=50):
     """Returns the users  tweeting the most in the data set retrieved
        in the data set. Users are returned in descending order of how
        many times they were tweeted."""
     self.freq = SimpleNGrams(char_upper_cutoff=20, tokenizer="twitter")
     for x in self.query.get_activity_set():
         self.freq.add(x.screen_name)
     return self.freq.get_tokens(n)
Example #4
 def get_top_links(self, n=20):
     """Returns the links most shared in the data set retrieved in
        the order of how many times each was shared."""
     self.freq = SimpleNGrams(char_upper_cutoff=100, tokenizer="space")
     for x in self.query.get_activity_set():
         for link_str in x.most_unrolled_urls:
             self.freq.add(link_str)
     return self.freq.get_tokens(n)
Example #5
 def get_top_links(self, n=20):
     """Returns the links most shared in the data set retrieved in
        the order of how many times each was shared."""
     self.freq = SimpleNGrams(char_upper_cutoff=100, tokenizer="space")
     for x in self.query.get_list_set():
         link_str = x[LINKS_INDEX]
         if link_str != "GNIPEMPTYFIELD" and link_str != "None":
             self.freq.add(link_str)
         else:
             self.freq.add("NoLinks")
     return self.freq.get_tokens(n)
Example #6
 def get_top_grams(self, n=20):
     """Returns the most common grams in the tweet text of the retrieved data set."""
     self.freq = SimpleNGrams(char_upper_cutoff=20, tokenizer="twitter")
     self.freq.sl.add_session_stop_list(["http", "https", "amp", "htt"])
     for x in self.query.get_activity_set():
         self.freq.add(x.all_text)
     return self.freq.get_tokens(n)
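Examples #3 through #6 follow the same pattern: construct a SimpleNGrams counter, feed it strings with add(), and read back the top entries with get_tokens(n). A standalone sketch of that pattern; the import path below is an assumption, since the excerpts show only the class name and the module layout of the Gnip utilities varies between versions:

# Assumed import path -- adjust to wherever SimpleNGrams lives in your checkout.
from simple_n_grams.simple_n_grams import SimpleNGrams

freq = SimpleNGrams(char_upper_cutoff=20, tokenizer="twitter")
freq.sl.add_session_stop_list(["http", "https", "amp"])  # optional extra stop words
for text in ("just setting up my twttr", "hello world", "hello again"):
    freq.add(text)
print(freq.get_tokens(20))  # top tokens with their counts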
Example #7
            description="See list of 1 and 2 grams (bag-of-words) for input corpus--1 docudment per line.")
    grams_parser.add_argument("file_name", metavar= "file_name", nargs="?", default=[], 
            help="Input file name (optional).")
    grams_parser.add_argument("-n", "--number-of-grams", dest="number_of_grams", default=None,
            help="Limit list to top n 1-grams and top n 2-grams.")
    grams_parser.add_argument("-c", "--char-limit", dest="char_limit", default=2, 
            help="The shortest grams to include in the count.")
    grams_parser.add_argument("-p", "--pretty-print", dest="pretty_print", action="store_true", default=False,
            help="Prettier output format")
    grams_parser.add_argument("-k", "--n-grams", dest="n_grams", default=2,
            help="N-gram depth (default 2)")
    grams_parser.add_argument("-f", "--filter", dest="filter", default=None,
            help="List of terms to filter \"the,and,happy\"")
    opts = grams_parser.parse_args()

    f = SimpleNGrams(charCutoff=int(opts.char_limit), n_grams=opts.n_grams)
    if opts.filter is not None:
        tmp = [x.lower().strip() for x in opts.filter.split(",")]
        f.sl.add_session_stop_list(tmp)
    for row in fileinput.FileInput(opts.file_name,openhook=fileinput.hook_encoded("utf-8")):
        f.add(row)
    if opts.number_of_grams is None:
        res = f.get_repr(opts.number_of_grams)
    else:
        res = f.get_repr(int(opts.number_of_grams))
    if opts.pretty_print:
        fmt = ["%5s", "%9s", "%5s", "%9s", "%24s", "%7s"] 
        for x in res.split('\n'):
            tmp_str = x.strip().split(",")
            sys.stdout.write(" ".join([j%i for i,j in zip(tmp_str,fmt)]) + "\n")
    else:
Example #8
            help="Input file name (optional).")
    grams_parser.add_argument("-n", "--number-of-grams", dest="number_of_grams", default=None,
            help="Limit list to top n 1-grams and top n 2-grams.")
    grams_parser.add_argument("-c", "--char-limit", dest="char_limit", default=2, 
            help="The shortest grams to include in the count.")
    grams_parser.add_argument("-p", "--pretty-print", dest="pretty_print", action="store_true", default=False,
            help="Prettier output format")
    grams_parser.add_argument("-k", "--n-grams", dest="n_grams", default=2,
            help="N-gram depth (default 2)")
    grams_parser.add_argument("-t", "--space-tokenizer", dest="space_tokenizer", default=False, action="store_true",
            help="Use alternate tokization on white-space only.")
    grams_parser.add_argument("-f", "--filter", dest="filter", default=None,
            help="List of terms to filter \"the,and,happy\"")
    opts = grams_parser.parse_args()

    f = SimpleNGrams(charCutoff=int(opts.char_limit), n_grams=opts.n_grams, space_tokenizer=opts.space_tokenizer)
    if opts.filter is not None:
        tmp = [x.lower().strip() for x in opts.filter.split(",")]
        f.sl.add_session_stop_list(tmp)
    for row in fileinput.FileInput(opts.file_name,openhook=fileinput.hook_encoded("utf-8")):
        f.add(row)
    if opts.number_of_grams is None:
        res = f.get_repr(opts.number_of_grams)
    else:
        res = f.get_repr(int(opts.number_of_grams))
    if opts.pretty_print:
        fmt = ["%5s", "%9s", "%5s", "%9s", "%24s", "%7s"] 
        for x in res.split('\n'):
            tmp_str = x.strip().split(",")
            sys.stdout.write(" ".join([j%i for i,j in zip(tmp_str,fmt)]) + "\n")
    else:
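The pretty-print branch in Examples #7 and #8 pads each comma-separated field of a get_repr() row into fixed-width columns by zipping the fields with a list of printf-style format strings. A toy illustration of that idiom with made-up values:

fmt = ["%5s", "%9s", "%5s", "%9s", "%24s", "%7s"]
row = "12,0.034,12,0.034,hello world,2".split(",")
# pair each field with its width spec and apply it, e.g. "%5s" % "12" -> "   12"
print(" ".join(spec % field for field, spec in zip(row, fmt)))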
Example #9
 def get_top_grams(self, n=20):
     """Returns the most common grams in the text field of the retrieved data set."""
     self.freq = SimpleNGrams(char_upper_cutoff=20, tokenizer="twitter")
     for x in self.query.get_list_set():
         self.freq.add(x[TEXT_INDEX])
     return self.freq.get_tokens(n)
Example #10
 def __init__(self, token_list_size=20):
     self.token_list_size = int(token_list_size)
     twitter_parser = argparse.ArgumentParser(
         description="GnipSearch supports the following use cases: %s" %
         str(self.USE_CASES))
     twitter_parser.add_argument("use_case",
                                 metavar="USE_CASE",
                                 choices=self.USE_CASES,
                                 help="Use case for this search.")
     twitter_parser.add_argument(
         "-f",
         "--filter",
         dest="filter",
         default="from:jrmontag OR from:gnip",
         help=
         "PowerTrack filter rule (See: http://support.gnip.com/customer/portal/articles/901152-powertrack-operators)"
     )
     twitter_parser.add_argument(
         "-l",
         "--stream-url",
         dest="stream_url",
         default=
         "https://search.gnip.com/accounts/shendrickson/search/wayback.json",
         help="Url of search endpoint. (See your Gnip console.)")
     twitter_parser.add_argument(
         "-c",
         "--count",
         dest="csv_count",
         action="store_true",
         default=False,
         help=
         "Return comma-separated 'date,counts' when using a counts.json endpoint."
     )
     twitter_parser.add_argument(
         "-b",
         "--bucket",
         dest="count_bucket",
         default="day",
         help=
         "Bucket size for counts query. Options are day, hour, minute (default is 'day')."
     )
     twitter_parser.add_argument(
         "-s",
         "--start-date",
         dest="start",
         default=None,
         help=
         "Start of datetime window, format 'YYYY-mm-DDTHH:MM' (default: 30 days ago)"
     )
     twitter_parser.add_argument(
         "-e",
         "--end-date",
         dest="end",
         default=None,
         help=
         "End of datetime window, format 'YYYY-mm-DDTHH:MM' [Omit for most recent activities] (default: none)"
     )
     twitter_parser.add_argument("-q",
                                 "--query",
                                 dest="query",
                                 action="store_true",
                                 default=False,
                                 help="View API query (no data)")
     twitter_parser.add_argument("-u",
                                 "--user-name",
                                 dest="user",
                                 default="*****@*****.**",
                                 help="User name")
     twitter_parser.add_argument("-p",
                                 "--password",
                                 dest="pwd",
                                 help="Password")
     twitter_parser.add_argument(
         "-n",
         "--results-max",
         dest="max",
         default=100,
         help="Maximum results to return (default 100)")
     self.options = twitter_parser.parse_args()
     self.twitter_parser = TwacsCSV(",", False, False, True, False, True,
                                    False, False, False)
     DATE_INDEX = 1
     TEXT_INDEX = 2
     LINKS_INDEX = 3
     USER_NAME_INDEX = 7
     space_tokenizer = False
     char_upper_cutoff = 11
     #
     if self.options.use_case.startswith("links"):
         char_upper_cutoff = 100
         space_tokenizer = True
     self.freq = SimpleNGrams(charUpperCutoff=char_upper_cutoff,
                              space_tokenizer=space_tokenizer)
     if self.options.use_case.startswith("user"):
         self.index = USER_NAME_INDEX
     elif self.options.use_case.startswith("wordc"):
         self.index = TEXT_INDEX
     elif self.options.use_case.startswith("rate"):
         self.index = DATE_INDEX
     elif self.options.use_case.startswith("link"):
         self.index = LINKS_INDEX
     elif self.options.use_case.startswith("time"):
         if not self.options.stream_url.endswith("counts.json"):
             self.options.stream_url = self.options.stream_url[:-5] + "/counts.json"
         if self.options.count_bucket not in ['day', 'minute', 'hour']:
             print >> sys.stderr, "Error. Invalid count bucket: %s \n" % str(
                 self.options.count_bucket)
             sys.exit()
     timeRE = re.compile(
         "([0-9]{4}).([0-9]{2}).([0-9]{2}).([0-9]{2}):([0-9]{2})")
     if self.options.start:
         dt = re.search(timeRE, self.options.start)
         if not dt:
             print >> sys.stderr, "Error. Invalid start-date format: %s \n" % str(
                 self.options.start)
             sys.exit()
         else:
             f = ''
             for i in range(re.compile(timeRE).groups):
                 f += dt.group(i + 1)
             self.fromDate = f
     if self.options.end:
         dt = re.search(timeRE, self.options.end)
         if not dt:
             print >> sys.stderr, "Error. Invalid end-date format: %s \n" % str(
                 self.options.end)
             sys.exit()
         else:
             e = ''
             for i in range(re.compile(timeRE).groups):
                 e += dt.group(i + 1)
             self.toDate = e
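The start/end handling at the bottom of Example #10 concatenates the five groups captured by timeRE into a compact YYYYmmDDHHMM string for the query window. A small standalone illustration with an example timestamp:

import re

timeRE = re.compile("([0-9]{4}).([0-9]{2}).([0-9]{2}).([0-9]{2}):([0-9]{2})")
dt = timeRE.search("2015-01-15T13:45")
if dt:
    # join year, month, day, hour, and minute: "201501151345"
    compact = "".join(dt.group(i + 1) for i in range(timeRE.groups))
    print(compact)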
Example #11
    grams_parser.add_argument("-t", "--space-tokenizer", dest="space_tokenizer", default=False, action="store_true",
            help="Use alternate tokization on white-space only.")
    grams_parser.add_argument("-w", "--twitter-tokenizer", dest="twitter_tokenizer", default=False, action="store_true",
            help="Use alternate Twitter tokization with hashtags and mentions intact.")
    grams_parser.add_argument("-f", "--filter", dest="filter", default=None,
            help="List of terms to filter \"the,and,happy\"")
    opts = grams_parser.parse_args()

    if opts.space_tokenizer:
        tokenizer = "space"
    elif opts.twitter_tokenizer:
        tokenizer = "twitter"
    else:
        tokenizer = "word"

    f = SimpleNGrams(char_lower_cutoff=int(opts.char_limit), n_grams=opts.n_grams, tokenizer=tokenizer)
    if opts.filter is not None:
        tmp = [x.lower().strip() for x in opts.filter.split(",")]
        f.sl.add_session_stop_list(tmp)
    for row in fileinput.FileInput(opts.file_name,openhook=fileinput.hook_encoded("utf-8")):
        f.add(row)
    if opts.number_of_grams is None:
        res = f.get_repr(opts.number_of_grams)
    else:
        res = f.get_repr(int(opts.number_of_grams))
    if opts.pretty_print:
        fmt = ["%5s", "%9s", "%5s", "%9s", "%34s", "%7s"] 
        for x in res.split('\n'):
            tmp_str = x.strip().split(",")
            sys.stdout.write(" ".join([j%i for i,j in zip(tmp_str,fmt)]) + "\n")
    else: