Exemplo n.º 1
0
#note that coordinate pairs are long/lat not lat/long
#Singapore coordinates
locations = ["103.61,1.22","104.01356,1.456674"]

#Johor coordinates
#locations = ["103.55,1.45", "103.87,1.63"]

# Stream every status matching the bounding box above and store it in MySQL,
# creating a row in `users` the first time an author is seen.
with tweetstream.FilterStream(
                SECOND_USERNAME,
                SECOND_PASSWORD,
                locations=locations) as stream:
    for tweet in stream:
        # Skip payloads that carry no user/text (i.e. anything that is not a
        # regular tweet) instead of aborting the whole run with a KeyError.
        if "user" not in tweet or "text" not in tweet:
            continue

        username = tweet["user"]["screen_name"]
        text = tweet["text"]
        # "place" may be missing or null; store NULL rather than crashing.
        # NOTE(review): assumes tweets.location is nullable - confirm schema.
        place = tweet.get("place")
        loc = place["full_name"] if place else None

        # A fresh connection is opened per tweet, as in the original code.
        db = Connection(host = HOST_NAME,
                        database = SG_MYSQL_DB_NAME,
                        user = SG_MYSQL_USER_NAME,
                        password = SG_MYSQL_PASSWORD
                )
        try:
            user = db.get("SELECT id FROM users WHERE username=%s", username)
            if not user:
                # First tweet from this author: create the user row, then
                # read it back to obtain its id.
                db.execute("INSERT into users (username) VALUES (%s)", username)
                user = db.get("SELECT id FROM users WHERE username=%s", username)
            db.execute("INSERT into tweets (user, tweet, location) VALUES (%s, %s, %s)", user["id"], text, loc)
        finally:
            # Always release the connection, even when a query fails.
            db.close()
Exemplo n.º 2
0
import cld
import re
from sgbeat.database import Connection
from details import (
    USERNAME,
    PASSWORD,
    HOST_NAME,
    MYSQL_USER_NAME,
    MYSQL_PASSWORD
)

# Connection to the "jb" database, from which the tweets are read.
db = Connection(host = HOST_NAME,
                database = "jb",
                user = MYSQL_USER_NAME,
                password = MYSQL_PASSWORD
               )
# Connection to the "jb_pure" database; it is not used in the code visible
# in this section.
db2 = Connection(host = HOST_NAME,
                 database = "jb_pure",
                 user = "******",
                 password = "******" 
                )

tweets = db.query("SELECT * FROM tweets")
count = 0
# compile a regex for urls. We don't want tweets with urls
# Compiled once, outside the loop, because the pattern never changes.
# NOTE(review): this only matches "http://" links; "https://" URLs are not
# detected - confirm whether that is intended.
r = re.compile(r"(http://[^ ]+)")
for t in tweets:
    # cld and the regex search below both operate on the UTF-8 encoded text
    c = t["tweet"].encode('utf-8')
    # language detection
    name, code, reliable, bytes_found, details = cld.detect(c)
    urlmatch = r.search(c)
Exemplo n.º 3
0
def filter_words(tweets):
    """Tokenise tweets and get rid of stop words.

    Each text is split on whitespace, every token is lower-cased and
    stripped of ``string.punctuation`` characters, and tokens found in the
    English stop-word list are dropped.  Tokens that become empty after
    punctuation removal are kept, as before.

    Args:
        tweets: iterable of ``(text, label)`` pairs.

    Returns:
        A list of ``(tokens, label)`` pairs, where ``tokens`` is a list of
        the remaining tokens in their original order.
    """
    # Load the stop-word list once per call (not once per token) and keep it
    # in a set so each membership test is O(1).
    stop_words = set(stopwords.words('english'))
    res = []
    for (words, loc) in tweets:
        lowered = [w.lower() for w in words.split()]
        stripped = [''.join(c for c in w if c not in string.punctuation) for w in lowered]
        # A list comprehension (rather than filter()) guarantees a real list
        # on Python 3 as well as Python 2.
        kept = [w for w in stripped if w not in stop_words]
        res.append((kept, loc))
    return res


# Connection to the "jb_pure" database, which holds the users and tweets
# queried below.
db = Connection(host = HOST_NAME,
                database = "jb_pure",
                user = "******",
                password = "******" 
               )

sg_users = db.query("SELECT id FROM users WHERE country='SG'")
jb_users = db.query("SELECT id FROM users WHERE country='JB'")
sg_tweets, jb_tweets = [], []
# Raw strings so the \w escape reaches the regex engine unchanged.
mtn = re.compile(r"@\w+")   # @mentions
# NOTE(review): `hash` shadows the builtin hash(); the name is kept because
# code outside this section may refer to it.
hash = re.compile(r"#\w+")  # #hashtags

# Collect every tweet written by a Singapore user, labelled "SG".
for u in sg_users:
    curr = db.query("SELECT tweet FROM tweets WHERE user=%s", u["id"])
    # get rid of mentions and hashtags, then trim surrounding whitespace
    curr = [hash.sub("", mtn.sub("", t['tweet'])).strip() for t in curr]
    sg_tweets.extend((text, "SG") for text in curr)

for u in jb_users: