Example #1
0
        tweet['trigrams'] = set(bigrams.filtered_trigrams(toks))
        for trigram in tweet['trigrams']:
            self.model.add(trigram)
            self.index[trigram].append(tweet)
        #self.tweets_by_text.append(tweet)
        #for ngram in set(bigrams.multi_ngrams(toks, n_and_up=3)):
        #  pass

    def fill_from_tweet_iter(self, tweet_iter):
        """Feed every tweet yielded by *tweet_iter* into add_tweet()."""
        add = self.add_tweet
        for tw in tweet_iter:
            add(tw)


if __name__ == '__main__':
    import cPickle as pickle
    import search
    q = sys.argv[1]
    smoothing = sys.argv[2]
    bg_model = lang_model.TokyoLM(readonly=True)
    lc = LinkedCorpus()
    tweet_iter = search.cleaned_results(q,
                                        pages=2,
                                        key_fn=search.user_and_text_identity,
                                        save=None,
                                        load=None)
    lc.fill_from_tweet_iter(tweet_iter)
    for ratio, ngram in lc.model.compare_with_bg_model(
            bg_model, 3, min_count=3, smoothing_algorithm=smoothing):
        print "%s\t%s" % ('_'.join(ngram), ratio)
Example #2
0
      self.bigram_index[None, bigram[1]].append(bigram)

    tweet['trigrams'] = set(bigrams.filtered_trigrams(toks))
    for trigram in tweet['trigrams']:
      self.model.add(trigram)
      self.index[trigram].append(tweet)
    #self.tweets_by_text.append(tweet)
    #for ngram in set(bigrams.multi_ngrams(toks, n_and_up=3)):
    #  pass

  def fill_from_tweet_iter(self, tweet_iter):
    """Index every tweet produced by *tweet_iter* into this corpus."""
    for item in tweet_iter:
      self.add_tweet(item)

if __name__=='__main__':
  import cPickle as pickle
  import search
  q = sys.argv[1]
  smoothing = sys.argv[2]
  bg_model = lang_model.TokyoLM(readonly=True)
  lc = LinkedCorpus()
  tweet_iter = search.cleaned_results(q,
      pages = 2, 
      key_fn = search.user_and_text_identity, 
      save = None,
      load = None
      )
  lc.fill_from_tweet_iter(tweet_iter)
  for ratio, ngram in lc.model.compare_with_bg_model(bg_model, 3, min_count=3, smoothing_algorithm=smoothing):
    print "%s\t%s" % ('_'.join(ngram), ratio)
Example #3
0
def the_app(environ, start_response):
    """WSGI application (generator): search tweets for opts.q, extract and
    rank topics, and stream the response body in chunks.

    Output format is selected by the 'format' option: 'pickle' (pickled
    result object), 'json' (trimmed topic dict), or 'dev' (HTML page).
    start_response() is called before the first chunk is yielded.
    """
    global_init()
    status = '200 OK'

    # Request options parsed out of the WSGI environ, with defaults.
    opts = Opts(
        environ,
        opt('q', default=''),
        opt('pages', default=2),
        opt('split', default=0),
        opt('simple', default=0),
        opt('max_topics', default=40),
        opt('ncol', default=3),
        opt('save', default=False),
        opt('load', default=False),
        opt('smoothing', default='lidstone'),
        opt('single_query', default=0),
        opt('format', default='dev'),
    )

    print "OPTIONS %s" % (opts, )

    response_headers = [('Content-type', 'text/html')]
    start_response(status, response_headers)

    if opts.single_query:
        # the requery: parse the narrower option set and delegate the whole
        # response to single_query(), then stop.
        opts2 = Opts(environ, opt('q', str), opt('topic_label', str),
                     opt('exclude', default=''))
        opts2.exclude = [int(x) for x in opts2.exclude.split()]
        for x in single_query(**opts2):
            yield x
        return

    # Fetch (or load cached) search results and index them into a corpus.
    lc = linkedcorpus.LinkedCorpus()
    tweets_file = 'saved_tweets/save_%s_tweets' % opts.q
    tweet_iter = search.cleaned_results(
        opts.q,
        pages=opts.pages,
        key_fn=search.user_and_text_identity,
        save=tweets_file if opts.save else None,
        load=tweets_file if opts.load else None)
    tweet_iter = deduper.merge_multitweets(tweet_iter)
    lc.fill_from_tweet_iter(tweet_iter)
    q_toks = bigrams.tokenize_and_clean(opts.q, True)
    # Topic extraction against the background model, then dedupe/grouping.
    res = ranking.extract_topics(lc, background_model, **opts)
    groups_by_tweet_id = deduper.dedupe(lc)
    for topic in res.topics:
        deduper.groupify_topic(topic, groups_by_tweet_id)
    ranking.late_topic_clean(res, max_topics=opts.max_topics)
    ranking.gather_leftover_tweets(res, lc)
    # gather_leftover_tweets may have appended a topic with no groups yet.
    if res.topics and res.topics[-1].groups is None:
        deduper.groupify_topic(res.topics[-1], groups_by_tweet_id)
    # Render per-tweet HTML for every group of every topic.
    for topic in res.topics:
        topic.tweet_ids = util.myjoin([tw['id'] for tw in topic.tweets])
        for group in topic.groups:
            group.head_html = nice_tweet(group.head, q_toks,
                                         topic.label_ngrams)
            group.rest_htmls = [
                nice_tweet(t, q_toks, topic.label_ngrams) for t in group.rest
            ]
    # Newest group first within each topic.
    for topic in res.topics:
        topic.groups.sort(key=lambda g: g.head['created_at'], reverse=True)
    if lc.tweets_by_id:
        earliest = min(tw['created_at'] for tw in lc.tweets_by_id.itervalues())
        time_since_earliest = nice_timedelta(datetime.utcnow() - earliest)
    else:
        time_since_earliest = None

    if opts.format == 'pickle':
        # pickle.dumps(res) is 800k with dump/load = 100ms/60ms
        # trimmed json-like version is 150k with dump/load = 5ms/2ms.
        yield pickle.dumps(res)
        return
    if opts.format == 'json':
        # Trimmed JSON-serializable structure (see size/timing note above).
        topic_info = dict((t.label, {
            'label':
            t.label,
            'nice_label':
            nice_label(t.label),
            'tweet_ids':
            t.tweet_ids,
            'groups': [{
                'head_html': g.head_html,
                'rest_htmls': g.rest_htmls
            } for g in t.groups],
            'query_refinement':
            ranking.query_refinement(opts.q, t),
        }) for t in res.topics)
        topic_list = [t.label for t in res.topics]
        results = {
            'topic_list': topic_list,
            'topic_info': topic_info,
            'time_since_earliest': time_since_earliest,
        }
        yield simplejson.dumps(results)
        return
    if opts.format != 'dev': raise Exception("bad format")

    # 'dev' HTML page: header, search form, then a two-column table with
    # the topic list on the left and a tweet pane on the right.
    for topic in res.topics:
        topic.tweets_html = topic_group_html(topic.groups)
    bigass_topic_dict = dict((t.label,
                              dict(
                                  label=t.label,
                                  tweets_html=t.tweets_html,
                                  tweet_ids=t.tweet_ids,
                              )) for t in res.topics)

    yield page_header()
    yield form_area(opts)
    yield "<table><tr>"
    yield "<th>topics"
    if lc.tweets_by_id:
        earliest = min(tw['created_at'] for tw in lc.tweets_by_id.itervalues())
        #latest   = max(tw['created_at'] for tw in lc.tweets_by_id.itervalues())
        s = "for %d tweets" % len(lc.tweets_by_id)
        s += " over the last %s" % nice_timedelta(datetime.utcnow() - earliest)
        yield " <small>%s</small>" % s

    yield "<th>tweets"
    yield "<tr><td valign=top id=topic_list>"

    # Clickable topic labels with group/tweet counts.
    # NOTE(review): the label is cgi-escaped for the attribute but inserted
    # raw as element text — confirm labels cannot contain markup.
    topic_labels = [
        '''<span class="topic_label" onclick="topic_click(this)" topic_label="%s"
  >%s</span><small>&nbsp;%d,&thinsp;%d</small><br>''' % (cgi.escape(
            topic.label), topic.label, topic.group_count, topic.tweet_count)
        for topic in res.topics
    ]
    for x in table_byrow(topic_labels, ncol=opts.ncol):
        yield x

    yield "<td valign=top>"
    yield "<div id=tweets>"
    yield "click on a topic on the left please"
    yield "</div>"
    yield "<div id=tweets_more>"
    yield "</div>"
    yield "</table>"
    yield "<script>"

    # Embed the topic data as a JS global for client-side topic switching.
    yield "topics = "
    yield simplejson.dumps(bigass_topic_dict)
    yield ";"
    yield "load_default_topic();"
    yield "</script>"
Example #4
0
def the_app(environ, start_response):
  """WSGI application (generator): search tweets for opts.q, extract and
  rank topics, and stream the response body in chunks.

  Output format is selected by the 'format' option: 'pickle' (pickled
  result object), 'json' (trimmed topic dict), or 'dev' (HTML page).
  start_response() is called before the first chunk is yielded.
  """
  global_init()
  status = '200 OK'

  # Request options parsed out of the WSGI environ, with defaults.
  opts = Opts(environ,
      opt('q', default=''),
      opt('pages', default=2),
      opt('split', default=0),
      opt('simple', default=0),
      opt('max_topics', default=40),
      opt('ncol', default=3),
      opt('save', default=False),
      opt('load', default=False),
      opt('smoothing', default='lidstone'),
      opt('single_query', default=0),
      opt('format', default='dev'),
      )

  print "OPTIONS %s" % (opts,)

  response_headers = [('Content-type','text/html')]
  start_response(status, response_headers)

  if opts.single_query:
    # the requery: parse the narrower option set and delegate the whole
    # response to single_query(), then stop.
    opts2 = Opts(environ, opt('q',str), opt('topic_label',str), opt('exclude',default=''))
    opts2.exclude = [int(x) for x in opts2.exclude.split()]
    for x in single_query(**opts2):
      yield x
    return

  # Fetch (or load cached) search results and index them into a corpus.
  lc = linkedcorpus.LinkedCorpus()
  tweets_file = 'saved_tweets/save_%s_tweets' % opts.q
  tweet_iter = search.cleaned_results(opts.q, 
      pages = opts.pages, 
      key_fn = search.user_and_text_identity, 
      save = tweets_file if opts.save else None,
      load = tweets_file if opts.load else None
  )
  tweet_iter = deduper.merge_multitweets(tweet_iter)
  lc.fill_from_tweet_iter(tweet_iter)
  q_toks = bigrams.tokenize_and_clean(opts.q, True)
  # Topic extraction against the background model, then dedupe/grouping.
  res = ranking.extract_topics(lc, background_model, **opts)
  groups_by_tweet_id = deduper.dedupe(lc)
  for topic in res.topics:
    deduper.groupify_topic(topic, groups_by_tweet_id)
  ranking.late_topic_clean(res, max_topics=opts.max_topics)
  ranking.gather_leftover_tweets(res, lc)
  # gather_leftover_tweets may have appended a topic with no groups yet.
  if res.topics and res.topics[-1].groups is None:
    deduper.groupify_topic(res.topics[-1], groups_by_tweet_id)
  # Render per-tweet HTML for every group of every topic.
  for topic in res.topics:
    topic.tweet_ids = util.myjoin([tw['id'] for tw in topic.tweets])
    for group in topic.groups:
      group.head_html = nice_tweet(group.head, q_toks, topic.label_ngrams)
      group.rest_htmls = [nice_tweet(t,q_toks,topic.label_ngrams) for t in group.rest]
  # Newest group first within each topic.
  for topic in res.topics:
    topic.groups.sort(key=lambda g: g.head['created_at'], reverse=True)
  if lc.tweets_by_id:
    earliest = min(tw['created_at'] for tw in lc.tweets_by_id.itervalues())
    time_since_earliest = nice_timedelta(datetime.utcnow() - earliest)
  else:
    time_since_earliest = None

  if opts.format == 'pickle':
    # pickle.dumps(res) is 800k with dump/load = 100ms/60ms
    # trimmed json-like version is 150k with dump/load = 5ms/2ms.
    yield pickle.dumps(res)
    return
  if opts.format == 'json':
    # Trimmed JSON-serializable structure (see size/timing note above).
    topic_info = dict( (t.label,
       {
         'label': t.label,
         'nice_label': nice_label(t.label),
         'tweet_ids': t.tweet_ids,
         'groups': [{'head_html':g.head_html, 'rest_htmls':g.rest_htmls} for g in t.groups],
         'query_refinement': ranking.query_refinement(opts.q, t),
       })
        for t in res.topics)
    topic_list = [t.label for t in res.topics]
    results = {'topic_list':topic_list, 'topic_info': topic_info, 'time_since_earliest': time_since_earliest,}
    yield simplejson.dumps(results)
    return
  if opts.format != 'dev': raise Exception("bad format")

  # 'dev' HTML page: header, search form, then a two-column table with
  # the topic list on the left and a tweet pane on the right.
  for topic in res.topics:
    topic.tweets_html = topic_group_html(topic.groups)
  bigass_topic_dict = dict((t.label, dict(
    label= t.label, 
    tweets_html= t.tweets_html, 
    tweet_ids= t.tweet_ids,
  )) for t in res.topics)

  yield page_header()
  yield form_area(opts)  
  yield "<table><tr>"
  yield "<th>topics"
  if lc.tweets_by_id:
    earliest = min(tw['created_at'] for tw in lc.tweets_by_id.itervalues())
    #latest   = max(tw['created_at'] for tw in lc.tweets_by_id.itervalues())
    s=  "for %d tweets" % len(lc.tweets_by_id)
    s+= " over the last %s" % nice_timedelta(datetime.utcnow() - earliest)
    yield " <small>%s</small>" % s

  yield "<th>tweets"
  yield "<tr><td valign=top id=topic_list>"

  # Clickable topic labels with group/tweet counts.
  # NOTE(review): the label is cgi-escaped for the attribute but inserted
  # raw as element text — confirm labels cannot contain markup.
  topic_labels = ['''<span class="topic_label" onclick="topic_click(this)" topic_label="%s"
  >%s</span><small>&nbsp;%d,&thinsp;%d</small><br>''' % (
    cgi.escape(topic.label), topic.label, topic.group_count, topic.tweet_count )
                  for topic in res.topics]
  for x in table_byrow(topic_labels, ncol=opts.ncol): yield x

  yield "<td valign=top>"
  yield "<div id=tweets>"
  yield "click on a topic on the left please"
  yield "</div>"
  yield "<div id=tweets_more>"
  yield "</div>"
  yield "</table>"
  yield "<script>"

  # Embed the topic data as a JS global for client-side topic switching.
  yield "topics = "
  yield simplejson.dumps(bigass_topic_dict)
  yield ";"
  yield "load_default_topic();"
  yield "</script>"