Example #1
    def add_tweet(self, tweet):
        """Record a tokenized tweet and index its n-grams for later lookup."""
        self.tweets_by_id[tweet['id']] = tweet
        toks = tweet['toks']
        # Accumulate the total token count across all tweets (the model's N).
        self.model.info['big_n'] += len(toks)

        # Unigrams: count each distinct unigram once per tweet, and map it
        # back to the tweets that contain it.
        the_unigrams = set(bigrams.filtered_unigrams(toks))
        tweet['unigrams'] = the_unigrams
        for unigram in the_unigrams:
            self.model.add(unigram)
            self.index[unigram].append(tweet)

        # Bigrams: indexed like unigrams, and additionally filed under
        # (first_word, None) and (None, second_word) so a bigram can be
        # looked up by either of its halves.
        the_bigrams = set(bigrams.filtered_bigrams(toks))
        tweet['bigrams'] = the_bigrams
        for bigram in the_bigrams:
            self.model.add(bigram)
            self.index[bigram].append(tweet)
            self.bigram_index[bigram[0], None].append(bigram)
            self.bigram_index[None, bigram[1]].append(bigram)

        # Trigrams: counted and indexed the same way as unigrams.
        tweet['trigrams'] = set(bigrams.filtered_trigrams(toks))
        for trigram in tweet['trigrams']:
            self.model.add(trigram)
            self.index[trigram].append(tweet)
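
The method depends on container attributes the snippet does not show. Since every lookup is immediately followed by .append(...), the two indexes are presumably defaultdict(list)s. A minimal sketch of that surrounding state follows; the class name, constructor shape, and usage lines are assumptions for illustration, not taken from the source:

from collections import defaultdict

class TweetIndex(object):  # hypothetical name; the real class is not shown
    def __init__(self, model):
        self.tweets_by_id = {}                 # tweet id -> tweet dict
        self.index = defaultdict(list)         # n-gram -> tweets containing it
        self.bigram_index = defaultdict(list)  # (w1, None) / (None, w2) -> bigrams
        # The model is assumed to expose .add(ngram) and .info['big_n'].
        self.model = model

# Hypothetical usage: tweets must carry 'id' and 'toks' keys.
# idx = TweetIndex(model)
# idx.add_tweet({'id': 1, 'toks': ['it', "'s", 'late']})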
Example #2
# Python 2 filter: read tab-separated lines on stdin, tokenize the last
# column with TweetMotif, and append a feature string built from its
# unigrams and bigrams.
import sys
sys.path.insert(0, '/usr2/corpora/tweets/tweetmotif')
import twokenize, util, bigrams
util.fix_stdio()
from sane_re import *

# Matches a detached contraction such as "' s", "' t", or "' m", so that
# e.g. "it ' s" can be rejoined to "it 's".
AposFix = _R(r"( |^)(' [stm])( |$)")

for line in sys.stdin:
    parts = util.unicodify(line[:-1]).split("\t")
    text = parts[-1]
    toks = twokenize.simple_tokenize(text)
    toked = " ".join(toks)
    #print "\t".join(parts[:-1]) + "\t" + toked
    #try: AposFix.show_match(toked)
    #except: pass
    # Close up the space inside each detached contraction, then lowercase.
    featstr = AposFix.gsub(toked,
                           lambda m: m[1] + m[2].replace(" ", "") + m[3])
    featstr = featstr.lower()
    toks = featstr.split()
    # Features: the unigram tokens plus underscore-joined bigrams.
    feats = [ug[0] for ug in bigrams.filtered_unigrams(toks)]
    feats += ["_".join(ng) for ng in bigrams.filtered_bigrams(toks)]

    # Echo the leading columns and append the feature string.
    print "\t".join(parts[:-1]) + "\t" + util.unicodify(" ".join(feats))