Example #1
0
def choose_multi_label(labels, lang_model):
    """Pick the best multi-label from *labels* using *lang_model* scores.

    A label longer than 3 tokens is reduced to its best-scoring trigram;
    a 3-token label is kept whole; shorter labels are ranked against all
    unigram/bigram/trigram combinations of the candidate set.
    """
    # The length of the longest candidate decides which strategy applies.
    top = util.argmax(labels, scorer=len)
    size = len(top)
    if size > 3:
        # Oversized label: keep only its highest-scoring trigram.
        pick = util.argmax(bigrams.trigrams(top),
                           lambda ng: lang_model.lidstone(ng))
        return (pick, )
    if size == 3:
        return (top, )

    # size <= 2: rough candidate set — would rather have all possible
    # skip n-grams (O(N^2) of them?)
    candidates = [(tuple(lbl), ) for lbl in labels]
    candidates += bigrams.bigrams(labels)
    candidates += bigrams.trigrams(labels)
    assert candidates
    candidates = [c for c in candidates if len(util.flatten(c)) <= 3]

    # sum is too weird; instead use lexicographic ordering of the
    # top-ranked sublabels in the multilabel (padded to length 3).
    def rank_key(ngrams):
        vals = [lang_model.lidstone(ng) for ng in ngrams]
        vals.extend([0] * (3 - len(vals)))
        vals.sort(reverse=True)
        return vals

    # max() returns the first maximal element, matching a stable
    # reverse sort followed by taking index 0.
    return max(candidates, key=rank_key)
Example #2
0
def _nice_tweet(tweet, q_toks, topic_ngrams):
  """Render one tweet as an HTML snippet: highlighted body plus author links."""
  # Topic n-grams always get the topic_hl span.
  hl_spec = {ng: ('<span class="topic_hl">', '</span>') for ng in topic_ngrams}
  # Query bigrams and unigrams get the q_hl span unless filtered out below.
  query_grams = (list(set(bigrams.bigrams(q_toks)))
                 + list(set(bigrams.unigrams(q_toks))))
  for qg in query_grams:
    if len(qg) == 1:
      # Skip stopword unigrams and unigrams already inside a topic n-gram.
      if qg[0] in bigrams.super_stopwords or any(qg[0] in ng for ng in topic_ngrams):
        continue
    elif any(kmp.isSubseq(qg, ng) for ng in topic_ngrams):
      # Skip bigrams that are subsequences of a topic n-gram.
      continue
    hl_spec[qg] = ('<span class="q_hl">', '</span>')

  text = highlighter.highlight(tweet['toks'], hl_spec)
  text = linkify(text, klass='t')
  text = At.gsub(text, r'<a class="at" target="_blank" href="http://twitter.com/\2">@\2</a>')

  parts = ['<span class="text">', text, '</span>',
           ' &nbsp; ', '<span class="authors">']
  if 'orig_tweets' in tweet:
    subtweets = tweet['orig_tweets']
    parts.append("%d authors:" % len(subtweets))
  else:
    subtweets = (tweet,)
  for sub in subtweets:
    user = sub['from_user']
    link = "http://twitter.com/%s/status/%s" % (user, sub['id'])
    # NOTE(review): the stringify/encode step looks redundant — kept for parity.
    parts.append(' <a class="m" target="_blank" href="%s">%s</a>'
                 % (util.stringify(link), util.stringify(user)))
  parts.append('</span>')
  return "".join(parts)
Example #3
0
def choose_multi_label(labels, lang_model):
  """Select the best multi-label for *labels* under *lang_model*.

  Labels longer than 3 tokens collapse to their best trigram; 3-token
  labels pass through; otherwise sub-n-gram combinations are ranked.
  """
  longest = util.argmax(labels, scorer=lambda ngram: len(ngram))
  if len(longest) > 3:
    # Too long to keep: score every trigram of it and wrap the winner.
    winner = util.argmax(bigrams.trigrams(longest),
                         lambda ng: lang_model.lidstone(ng))
    result = (winner,)
  elif len(longest) == 3:
    result = (longest,)
  else:
    # len(longest) <= 2: crude candidate pool — would rather want all
    # possible skip n-grams (O(N^2) of them?)
    pool = ([(tuple(lab),) for lab in labels]
            + bigrams.bigrams(labels)
            + bigrams.trigrams(labels))
    assert pool
    pool = [cand for cand in pool if len(util.flatten(cand)) <= 3]

    # sum is too weird; rank by lexicographic ordering of the
    # top-ranked sublabels in the multilabel (zero-padded to 3).
    def score_key(ngrams):
      padded = [lang_model.lidstone(ng) for ng in ngrams]
      while len(padded) < 3:
        padded.append(0)
      padded.sort(reverse=True)
      return padded

    # First maximal element == head of a stable reverse sort.
    result = max(pool, key=score_key)
  return result
Example #4
0
def _nice_tweet(tweet, q_toks, topic_ngrams):
    """Build the HTML rendering of a tweet: highlighted text and author links."""
    # Topic highlights first; query highlights are layered on top below.
    hl_spec = {}
    for ng in topic_ngrams:
        hl_spec[ng] = ('<span class="topic_hl">', '</span>')
    grams = list(set(bigrams.bigrams(q_toks)))
    grams.extend(set(bigrams.unigrams(q_toks)))
    for qg in grams:
        # Drop stopword unigrams, unigrams covered by a topic n-gram,
        # and longer grams that are subsequences of a topic n-gram.
        if len(qg) == 1 and qg[0] in bigrams.super_stopwords:
            continue
        if len(qg) == 1 and any(qg[0] in ng for ng in topic_ngrams):
            continue
        if len(qg) >= 2 and any(kmp.isSubseq(qg, ng) for ng in topic_ngrams):
            continue
        hl_spec[qg] = ('<span class="q_hl">', '</span>')

    body = highlighter.highlight(tweet['toks'], hl_spec)
    body = linkify(body, klass='t')
    body = At.gsub(
        body,
        r'<a class="at" target="_blank" href="http://twitter.com/\2">@\2</a>')

    html = '<span class="text">' + body + "</span>"
    html += " &nbsp; "
    html += '<span class="authors">'
    if 'orig_tweets' in tweet:
        subtweets = tweet['orig_tweets']
        html += "%d authors:" % len(subtweets)
    else:
        subtweets = (tweet, )
    for st in subtweets:
        author = st['from_user']
        url = "http://twitter.com/%s/status/%s" % (author, st['id'])
        # NOTE(review): the stringify/encode call looks redundant — kept as-is.
        html += ' <a class="m" target="_blank" href="%s">%s</a>' % (
            util.stringify(url), util.stringify(author))
    html += '</span>'
    return html