Example #1
def type_clean(val, type):
    # Coerce val to the requested type (Python 2: relies on the unicode builtin).
    if type == bool:
        if val in (False, 0, '0', 'f', 'false', 'False', 'no', 'n'):
            return False
        if val in (True, 1, '1', 't', 'true', 'True', 'yes', 'y'):
            return True
        raise ValueError("bad bool value %s" % repr(val))
    if type == str or type == unicode:
        # nope no strings, you're gonna get unicode instead!
        return util.unicodify(val)
    return type(val)
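
A quick usage sketch (Python 2, since the function depends on the unicode builtin; util is TweetMotif's helper module, imported in the later examples):

type_clean('yes', bool)   # True
type_clean('0', bool)     # False
type_clean('42', int)     # 42
type_clean('x', str)      # u'x' -- always unicode, never str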
Example #2
def truncate_at(s, max=40):
    # Truncate s to at most max characters, marking the cut with an ellipsis.
    # The literal must be u"…": concatenating a non-ASCII byte string onto
    # unicode raises UnicodeDecodeError under Python 2.
    s = util.unicodify(s)
    if len(s) > max:
        s = s[:max] + u"…"
    return s
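
For instance (truncated results are max + 1 characters long, since the ellipsis is appended after the cut):

truncate_at("short")             # u'short', unchanged
truncate_at("x" * 50, max=10)    # u'xxxxxxxxxx…'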
Example #3
def output(s):
    # Python 2 print statement; coerce to unicode first so any input prints uniformly.
    print util.unicodify(s)
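
A usage sketch (assumes util.fix_stdio() has wrapped stdout so non-ASCII prints cleanly, as the script in Example #4 does):

output(u"caf\u00e9")   # prints café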
Example #4
import sys
sys.path.insert(0, '/usr2/corpora/tweets/tweetmotif')
import twokenize, util, bigrams
util.fix_stdio()
from sane_re import *

# Rejoin contractions the tokenizer split apart: "' s" / "' t" / "' m" -> "'s" etc.
AposFix = _R(r"( |^)(' [stm])( |$)")

for line in sys.stdin:
    # Input is tab-separated; the tweet text sits in the last field.
    parts = util.unicodify(line[:-1]).split("\t")
    text = parts[-1]
    toks = twokenize.simple_tokenize(text)
    toked = " ".join(toks)
    #print "\t".join(parts[:-1]) + "\t" + toked
    #try: AposFix.show_match(toked)
    #except: pass
    # Close up the split contraction: group 2 is "' s", "' t", or "' m".
    featstr = AposFix.gsub(toked,
                           lambda m: m[1] + m[2].replace(" ", "") + m[3])
    featstr = featstr.lower()
    toks = featstr.split()
    # Unigram features plus underscore-joined bigram features.
    feats = [ug[0] for ug in bigrams.filtered_unigrams(toks)]
    feats += ["_".join(ng) for ng in bigrams.filtered_bigrams(toks)]

    print "\t".join(parts[:-1]) + "\t" + util.unicodify(" ".join(feats))
Example #5
from __future__ import print_function
import json
import tweepy
import datetime  # used for time stamping program start/stop
import traceback

from util import unicodify

# Attributes I don't want
UNWANTED_ATTR = [
    "contributors", "current_user_retweet", "favorited", "geo", "id",
    "in_reply_to_status_id", "lang", "quoted_status_id", "retweeted", "source",
    "in_reply_to_user_id", "entities", "extended_entities",
    "in_reply_to_screen_name"
]
UNWANTED_ATTR = unicodify(UNWANTED_ATTR)

# Nested attributes that I want
WANTED_NESTED_ATTR = {
    "user": [
        "followers_count", "friends_count", "geo_enabled", "id_str",
        "location", "protected", "time_zone", "statuses_count", "created_at"
    ],
    "extended_tweet": ["full_text"]
}
WANTED_NESTED_ATTR = unicodify(WANTED_NESTED_ATTR)


def trim_tweet(tweet_json):
    """Trims and returns a tweet (JSON object).