예제 #1
0
    MYSQL_USER_NAME,
    MYSQL_PASSWORD
)

# Source database ("jb"): the tweets table is read from this connection.
# Host and credentials come from the HOST_NAME / MYSQL_* constants.
db = Connection(
    host=HOST_NAME,
    database="jb",
    user=MYSQL_USER_NAME,
    password=MYSQL_PASSWORD,
)

# Destination database ("jb_pure"): filtered tweets are inserted here.
# NOTE(review): the user/password literals look masked -- confirm the real
# values are supplied before running.
db2 = Connection(
    host=HOST_NAME,
    database="jb_pure",
    user="******",
    password="******",
)

# Copy every tweet from `db` into `db2` that is detected as English (or of
# unknown language), contains no http:// url, and has not been copied already.
tweets = db.query("SELECT * FROM tweets")
count = 0
# compile the url regex once, outside the loop. We don't want tweets with urls
r = re.compile(r"(http://[^ ]+)")
# we use a set to save tweets, and check against that to prevent duplicates.
# It has to be created once, before the loop: re-creating it on every
# iteration left it always empty, so the duplicate check never rejected
# anything.
saved = set()
for t in tweets:
    c = t["tweet"].encode('utf-8')
    # language detection
    name, code, reliable, bytes_found, details = cld.detect(c)
    urlmatch = r.search(c)
    if code in ("en", "un") and not urlmatch and c not in saved:
        # we allow 'unknown' languages into our database, as these are mostly short singlish sentences
        db2.execute("INSERT INTO tweets (user, tweet, location) VALUES (%s, %s, %s)", t["user"], c, t["location"])
        saved.add(c)
    else:
예제 #2
0
    res = []
    for (words, loc) in tweets:
        words_filtered = [w.lower() for w in words.split()] 
        words_filtered = [''.join(c for c in w if c not in string.punctuation) for w in words_filtered]
        words_filtered = filter(lambda x: x not in stopwords.words('english'), words_filtered)
        res.append((words_filtered, loc))
    return res


# Connection to the "jb_pure" database; the users and tweets tables are
# queried through it below.
# NOTE(review): the user/password literals look masked -- confirm the real
# values are supplied before running.
db = Connection(
    host=HOST_NAME,
    database="jb_pure",
    user="******",
    password="******",
)

# Build two lists of (cleaned_tweet_text, country_label) pairs, one for users
# whose country is 'SG' and one for users whose country is 'JB'.
sg_users = db.query("SELECT id FROM users WHERE country='SG'")
jb_users = db.query("SELECT id FROM users WHERE country='JB'")
sg_tweets, jb_tweets = [], []
# Raw strings so that `\w` reaches the regex engine as written instead of
# being an invalid string escape sequence.
mtn = re.compile(r"@\w+")   # @mentions
# NOTE: this name shadows the builtin hash(); it is kept as-is because code
# further down the file may refer to it.
hash = re.compile(r"#\w+")  # #hashtags


def _labelled_tweets(user_id, label):
    """Return ``(text, label)`` pairs for every tweet of the given user.

    Each tweet has its @mentions and #hashtags removed and the surrounding
    whitespace stripped before being paired with ``label``.
    """
    rows = db.query("SELECT tweet FROM tweets WHERE user=%s", user_id)
    return [
        (hash.sub("", mtn.sub("", row['tweet'])).strip(), label)
        for row in rows
    ]


for u in sg_users:
    sg_tweets.extend(_labelled_tweets(u["id"], "SG"))

for u in jb_users:
    jb_tweets.extend(_labelled_tweets(u["id"], "JB"))