Example #1
    if tweet['id'] in tok_cache:
        #print "CACHE HIT    %s" % tweet['text']
        toks = pickle.loads(tok_cache[tweet['id']])
    else:
        #print "NEW ANALYSIS %s" % tweet['text']
        toks = tokenize_and_clean(tweet['text'], alignments=True)
        tok_cache[tweet['id']] = pickle.dumps(toks)
    tweet['toks'] = toks


import re
import pickle

import twokenize
from twokenize import regex_or
mycompile = lambda pat: re.compile(pat, re.UNICODE)
# junk tokens are a more aggressive cleaning assumption than usual.
JunkTok = mycompile(r'''^[^a-zA-Z0-9_@]+$''')
# don't form n-grams across phrase-boundary markers.
PhraseBoundaryTok = regex_or(r'''[.,“"'?!:;|-]+''', twokenize.Entity)
PhraseBoundaryTok = mycompile('^' + PhraseBoundaryTok + '$')
EdgePunctTok = mycompile('^' + twokenize.EdgePunct + '+$')


def tokenize_and_clean(msg, alignments):
    if alignments:
        toks = twokenize.tokenize(msg)
    else:
        toks = twokenize.simple_tokenize(msg)
    for i in range(len(toks)):
        toks[i] = toks[i].lower()
    inds = range(len(toks))
    #if len(inds) < len(toks): print "dropping junk", sorted(list(toks[i] for i in (set(range(len(toks)))-set(inds))))
    if alignments:
        return toks.subset(inds)
    else:
        return [toks[i] for i in inds]
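A minimal usage sketch under the snippet's own assumptions (the sample tweet text is invented; alignments=False is used so the result is a plain lowercased token list rather than the alignment-carrying object that twokenize.tokenize returns):

sample = "@user RT this!! so good http://example.com :) #nlp"
toks = tokenize_and_clean(sample, alignments=False)
print(toks)

# JunkTok and PhraseBoundaryTok are per-token tests, e.g. for deciding which
# tokens may participate in n-grams:
for t in toks:
    if PhraseBoundaryTok.match(t):
        print("phrase boundary: %r" % t)
    elif JunkTok.match(t):
        print("junk:            %r" % t)
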
Example #2
  # Fragment of a larger function: vocab, word_counts, doc_counts and replacements
  # come from the enclosing scope (defaultdict is collections.defaultdict).
  new_vocab = set()
  new_word_counts = defaultdict(int)
  new_doc_counts = defaultdict(int)
  
  for w in vocab:
    r = replacements.get(w,w)
    new_vocab.add(r)
    new_word_counts[r] += word_counts[w]
    new_doc_counts[r] += doc_counts[w]
  return new_vocab, new_word_counts, new_doc_counts, replacements
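For intuition, a self-contained rerun of the same merge with made-up data (the fragment above presumably sits inside a function that receives vocab, word_counts, doc_counts and replacements):

from collections import defaultdict

vocab = set(['u', 'you', 'lol'])
word_counts = {'u': 5, 'you': 3, 'lol': 7}
doc_counts = {'u': 4, 'you': 2, 'lol': 6}
replacements = {'u': 'you'}   # fold the variant 'u' into 'you'

new_vocab = set()
new_word_counts = defaultdict(int)
new_doc_counts = defaultdict(int)
for w in vocab:
    r = replacements.get(w, w)
    new_vocab.add(r)
    new_word_counts[r] += word_counts[w]
    new_doc_counts[r] += doc_counts[w]

print(sorted(new_vocab))        # ['lol', 'you']
print(new_word_counts['you'])   # 8 = 5 + 3, the counts of 'u' and 'you' merged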
  
  

import re

import twokenize
import emoticons   # companion module providing Emoticon_RE, used below

cur_user_words = []
last_username = None

Punct = twokenize.regex_or(twokenize.PunctChars, twokenize.Entity, twokenize.EdgePunct, r'[\*]')
Punct_RE = re.compile('^(%s)+$' % Punct, re.I|re.U)

def get_tokens(text):
  toks = twokenize.tokenize(text.lower())
  toks = [t for t in toks if not t.startswith('@') and t != 'rt' ]
  toks = ["-PUNCT-" if Punct_RE.search(t) and not emoticons.Emoticon_RE.search(t) else t for t in toks]
  # toks = [t.replace("#","") for t in toks]
  return toks

# for line in sys.stdin:
#   # print get_tokens(line)
#   print (" ".join(get_tokens(line))).encode('utf-8')
#   # print "\n" + "\n".join(get_tokens(line))
# sys.exit(0)
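A rough sketch of calling get_tokens directly (the tweet below is made up; exact token splits depend on twokenize):

example = u"RT @user: This is GREAT !!! :-) #winning"
print(u" ".join(get_tokens(example)))
# 'rt' and the @-mention are dropped, '!!!' and ':' collapse to -PUNCT-,
# and ':-)' survives because emoticons.Emoticon_RE matches it
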
Example #3
import re

import twokenize as tw   # the code below refers to the twokenize module as `tw`

# define our own slightly more limited version of the emoticon regex
# adapted from twokenize.py

# things we need to deal with separately
our_bfLeft = u"(0|[oO]|[vV]|\\$|[tT]|[xX]|;|@|\\^|\\*)".encode('utf-8')
our_basicface = "(?:" + our_bfLeft + tw.bfCenter + ")|" + tw.s3 + "|" + tw.s4 + "|" + tw.s5

pattern = tw.regex_or(
    # myleott: Standard version  :) :( :] :D :P
    "(?:>|)?" + tw.regex_or(tw.normalEyes, tw.wink) + tw.regex_or(tw.noseArea,"[Oo]") + tw.regex_or(tw.tongue+r"(?=\W|$|RT|rt|Rt)", tw.otherMouths+r"(?=\W|$|RT|rt|Rt)", tw.sadMouths, tw.happyMouths),

    # myleott: reversed version (: D:  use positive lookbehind to remove "(word):"
    # myleott: because eyes on the right side is more ambiguous with the standard usage of : ;
    tw.regex_or("(?<=(?: ))", "(?<=(?:^))") + tw.regex_or(tw.sadMouths,tw.happyMouths,tw.otherMouths) + tw.noseArea + tw.regex_or(tw.normalEyes, tw.wink) + "(?:<|)?",

    our_basicface,

    # myleott: o.O and O.o are two of the biggest sources of differences
    #          between this and the Java version. One little hack won't hurt...
    tw.oOEmote
)


pattern = unicode(pattern).decode('utf-8')
reg = re.compile(pattern, re.UNICODE)
# print reg

def get_emoticon_count(text):
    """
    Get an approximate number of emoticons contained in the input text.
    """
    # count non-overlapping matches of the emoticon pattern
    return len(reg.findall(text))
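
A quick check of the counter on an invented string:

if __name__ == '__main__':
    print(get_emoticon_count(u"great game :) :D but the ref... o.O"))
    # :) and :D match the standard branch and o.O matches tw.oOEmote,
    # so this should report 3 under the pattern above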