def feedModel(allUsernames, cla):
    """Feed the tweets of the given users into the word model for class `cla`.

    For every screen name the first timeline page (up to 200 statuses) is
    requested.  Each tweet text updates the module-level statistics
    (`docCount`, `wordProb`, `wordSize`) and the vocabulary `D` under the
    label `cla`; a tweet whose id is not stored yet is added to the
    Beautybiz_training_tweet table with `cl=cla`.

    Args:
        allUsernames: iterable of Twitter screen names.
        cla: class label used as key into the counters and saved with each
            stored tweet.
    """
    global docCount, D, wordProb, wordSize
    for username in allUsernames:
        print(username)
        page = 1
        # `page < 2` limits the crawl to the first page of the timeline.
        while page < 2:
            statuses = api.statuses.user_timeline(
                screen_name=username, count=200, page=page)
            if not statuses:
                # Nothing returned for this page: flush pending rows, stop.
                TDB.session.commit()
                break

            for tweet in statuses:
                text = tweet['text']

                # Update the per-class document and word statistics.
                docCount[cla] += 1
                print(docCount[cla])
                for feature in getFeatures(text):
                    D.add(feature)
                    wordProb[cla][feature] += 1
                    wordSize[cla] += 1
                print(text)

                # Save the tweet in the beautybiz_training_tweet table unless
                # a row with the same tweet id already exists.
                stored = TDB.session.query(
                    TDB.Beautybiz_training_tweet).filter(
                        TDB.Beautybiz_training_tweet.tweet_id ==
                        tweet['id']).all()
                # "1" when the token 'rt' occurs in the tokenized text.
                is_rt = "1" if 'rt' in tokenize(text) else "0"
                if stored:
                    continue
                TDB.session.add(TDB.Beautybiz_training_tweet(
                    tweet_id=tweet['id'],
                    text=text,
                    created_at=str(tweet['created_at']),
                    # latitude-longitude of tweet location (nullable)
                    tweet_coords=str(tweet['coordinates']),
                    # place associated with the tweet (nullable)
                    place=str(tweet['place']),
                    isRT=is_rt,
                    retweeted=tweet['retweeted'],
                    user_id=tweet['user']['id'],
                    language=tweet['lang'],
                    cl=cla))

            page += 1  # next page
            time.sleep(10)
        TDB.session.commit()
Exemplo n.º 2
0
                           CONSUMER_SECRET)
api = twitter.Twitter(auth=auth)

# Users of interest, read from the Users table.
uoi = scan(TDB.session, TDB.Users)

for user in uoi:
    uid = user.id
    # ids of the people that this user is following
    frnds = api.friends.ids(user_id=uid)
    friend_ids = frnds["ids"]
    num_friends = len(friend_ids)

    check_id = TDB.session.query(
        TDB.User_vs_class).filter(TDB.User_vs_class.user_id == uid).all()
    if not check_id:
        # Class label by number of accounts followed:
        # more than 2500 and below 5000000 -> 'B', 5000000 or more -> 'C',
        # everything else -> 'S'.
        if 2500 < num_friends < 5000000:
            user_class = 'B'
        elif num_friends >= 5000000:
            user_class = 'C'
        else:
            user_class = 'S'

        # Fix: store the number of ids; `len(frnds)` only counted the keys
        # of the API response object.
        # NOTE(review): the column is called followers_count but receives
        # the number of accounts the user follows -- confirm intent.
        uc = TDB.User_vs_class(user_id=uid,
                               followers_count=num_friends,
                               cl=user_class)
        TDB.session.add(uc)

    if num_friends < 2500:
        # Walk the followed ids in slices of 100.
        for n in range(0, num_friends, 100):
            frnds_ids = friend_ids[n:n + 100]
Exemplo n.º 3
0
 while page < 10:
     statuses = api.statuses.user_timeline(screen_name=username,
                                           count=200,
                                           page=page)
     if statuses:
         for tweet in statuses:
             utl.append(tweet)
             print "(%s) %s" % (tweet["created_at"], tweet["text"])
             # fill table with user id of this user, tweet id of EACH tweet in his/her timeline, and time of tweet.
             check_id = TDB.session.query(TDB.User_timeline).filter(
                 TDB.User_timeline.user_id == tweet['user']['id']).filter(
                     TDB.User_timeline.tweet_id == tweet['id']).all()
             if (len(check_id) == 0):
                 ut = TDB.User_timeline(user_id=tweet['user']['id'],
                                        tweet_id=tweet['id'],
                                        date_time_str=str(
                                            tweet['created_at']),
                                        topic='cosmetics')
                 TDB.session.add(ut)
             # add tweet details to tweet table
             check_id = TDB.session.query(
                 TDB.Tweet).filter(TDB.Tweet.id == tweet['id']).all()
             if (len(check_id) == 0):
                 tw = TDB.Tweet(
                     id=tweet['id'],
                     text=tweet['text'],
                     created_at=str(tweet['created_at']),
                     tweet_coords=str(tweet['coordinates']),
                     place=str(tweet['place']),
                     retweeted=tweet['retweeted'],
                     user_id=tweet['user']['id'],
Exemplo n.º 4
0
            print kw.id, " ", kw.priority, " ", kw.keyword
            res = api.search.tweets(q=kw.keyword, count=100)
            pickle.dump(res, fh)
            #    #print res
            for tweet in res['statuses']:
                # user table
                #                print 'no'
                tu = tweet['user']
                check_id = TDB.session.query(
                    TDB.User).filter(TDB.User.id == tu['id']).all()
                if (len(check_id) == 0):
                    u = TDB.User(
                        id=tu['id'],
                        name=tu['name'],
                        location=tu['location'],
                        created_at=tu['created_at'],
                        friends_count=tu['friends_count'],
                        followers_count=tu['followers_count'],
                        following=tu['following'],
                    )
                    TDB.session.add(u)

                # tweet table
                check_id = TDB.session.query(
                    TDB.Tweet).filter(TDB.Tweet.id == tweet['id']).all()
                if (len(check_id) == 0):
                    tw = TDB.Tweet(
                        id=tweet['id'],
                        text=tweet['text'],
                        created_at=str(tweet['created_at']),
                        tweet_coords=str(tweet['coordinates']),
Exemplo n.º 5
0
                this_cl = 'B'
                check_id = TDB.session.query(TDB.Beauty_tweet).filter(
                    TDB.Beauty_tweet.tweet_id == tweet['id']).all()
                if (len(check_id) == 0):
                    print tweet['text']
                    isCosTw = raw_input('Is this tweet a cosmetics tweet? : ')
                    print 'ur answer  is :   ', isCosTw
                    if ((isCosTw == 'n') or (isCosTw == 'no')):
                        print 'ur answer  is :   ', isCosTw
                        this_cl = 'O'

                    print 'this_cl = ', this_cl
                    # fill this tweet in beauty_tweet table:
                    bt = TDB.Beauty_tweet(tweet_id=tweet['id'],
                                          text=tweet['text'],
                                          cl=this_cl,
                                          user_id=tweet['user']['id'],
                                          user_screen_name=username)

                    TDB.session.add(bt)
                    TDB.session.commit()

# Alternatively:
#tl = TDB.scan(TDB.session, TDB.User_timeline)
#for user in uoi:
#    # get his uid
#    # get his tids
#    for tid in tids:

#    for tw in tweets:
#        print " happy anniversary hon!!! was beautiful!!!! "
Exemplo n.º 6
0
     # Request the timeline page by page, 200 statuses per request.
     # NOTE(review): despite its name, `username` is sent as `user_id`.
     while page < 17:
         statuses = api.statuses.user_timeline(user_id=username,
                                               count=200,
                                               page=page)
         if statuses:
             # Add the user's details to the users table; only attempted
             # while no user row has been added yet.
             if userAddedCtr == 0:
                 tu = statuses[0]['user']
                 vf = "n"
                 check_id = TDB.session.query(TDB.Users).filter(
                     TDB.Users.id == tu['id']).all()
                 if not check_id:
                     # Fix: the flag used to be written to an unused name
                     # (`verified`), so `vf` was always stored as "n".
                     if tu["verified"]:
                         vf = "y"
                     u = TDB.Users(id=tu['id'],
                                   name=tu['name'],
                                   screen_name=tu['screen_name'],
                                   location=tu['location'],
                                   created_at=tu['created_at'],
                                   language=tu['lang'],
                                   verified=vf,
                                   url=str(tu['url']))
                     TDB.session.add(u)
                     userAddedCtr += 1
             for tweet in statuses:
                 # Fill the timeline table with the user id, the tweet id of
                 # EACH tweet in the timeline, and the time of the tweet,
                 # skipping (user, tweet) pairs that are already stored.
                 check_id = TDB.session.query(TDB.User_timeline).filter(
                     TDB.User_timeline.user_id == tweet['user']['id']).filter(
                         TDB.User_timeline.tweet_id == tweet['id']).all()
                 if not check_id:
                     ut = TDB.User_timeline(
                         user_id=tweet['user']['id'],
                         tweet_id=tweet['id'],
                         date_time_str=str(tweet['created_at']))
                     TDB.session.add(ut)
                #print "effect ", poslabel[sorted_idx[i+1]], " on ", all_dates[sorted_idx[i+1]]
                #print "delta ", delta.days
                cause = poslabel[sorted_idx[i]]
                effect = poslabel[sorted_idx[i + 1]]
                cause_effect[cause][effect] += 1
                print cause, "  ", effect
        new_event = 0

    # Build one Dependency_analysis row for `user_id` from the per-category
    # tweet counters and selected entries of the `cause_effect` table.
    # NOTE(review): cause_effect[a][b] presumably counts how often a tweet
    # labelled `a` was followed by one labelled `b` -- confirm against the
    # loop that fills it.
    da = TDB.Dependency_analysis(
        user_id=user_id,
        num_biz_following=get_company_followed_count(user_id),
        bt=num_beauty_tweets,
        rt=num_regret_tweets,
        st=num_shop_tweets,
        mt=num_money_tweets,
        snst=num_sorrynosorry_tweets,
        ot=num_other_tweets,
        btor=cause_effect['beauty']['regret'],
        btos=cause_effect['beauty']['shop'],
        btom=cause_effect['beauty']['money'],
        btosns=cause_effect['beauty']['sorrynosorry'],
        stor=cause_effect['shop']['regret'],
        mtor=cause_effect['money']['regret'],
    )

    # Persist the row right away.
    TDB.session.add(da)
    TDB.session.commit()
    #print "company followed count: ", get_company_followed_count(user_id)
    #for i in range(0,5):
    #    for j in range(i+1, 5):

    # for atweet in scan(tweet_by_time):
# -*- coding: utf-8 -*-
"""
Created on Sat Nov 29 18:36:27 2014

@author: rishu
"""

import TweetDB as TDB
from EpiPipesApi import *

# Tag every scanned tweet id with the topic 'cosmetics' in Topic_Tweetid,
# skipping ids that already have a row.  `count` tracks how many tweets have
# been visited and is printed twice per tweet (before and after the insert).
count = 0
for tw in scan(TDB.session, TDB.Tweet.id):
    existing = TDB.session.query(TDB.Topic_Tweetid).filter(
        TDB.Topic_Tweetid.tweet_id == tw.id).all()
    count += 1
    print(count)
    if not existing:
        TDB.session.add(TDB.Topic_Tweetid(topic='cosmetics', tweet_id=tw.id))

    print(count)

# A single commit after all rows have been queued.
TDB.session.commit()
Exemplo n.º 9
0
"""
import sys
import TweetDB
from configsreader import getconfigs
from TwitterAPI import TwitterAPI
from TweetClasses import Tweet

# Configuration is loaded via getconfigs() from the first command-line
# argument (presumably a path to a config file -- confirm).
CONFIGS = getconfigs(sys.argv[1])

# Module-level API client built from the four credential entries in CONFIGS;
# it is also the default `api` argument of search() below.
api = TwitterAPI(CONFIGS['consumerkey'],       
                 CONFIGS['consumersecret'],
                 CONFIGS['accesstokenkey'],
                 CONFIGS['accesstokensecret'])

def search(query, feed='search/tweets', api=api, n=100):
    """Request `feed` for `query` and wrap every returned item in a Tweet.

    Args:
        query: value sent as the 'q' request parameter.
        feed: resource passed to `api.request`.
        api: client object exposing `request(feed, params)`; defaults to the
            module-level client.
        n: value sent as the 'count' request parameter.

    Returns:
        A list of Tweet objects, one per item yielded by the response.
    """
    params = {'q': query, 'count': n}
    response = api.request(feed, params)
    return [Tweet(item) for item in response]

terms = ('hillary', 'trump')

# Collect the search results of all terms, in term order, in one flat list.
tweets = [found for term in terms for found in search(term)]

# Store every collected tweet, then commit once and close the connection.
db = TweetDB.sqlconnect("./tweets.db")
for tweet in tweets:
    TweetDB.add(tweet, db)
db.commit()
db.close()
Exemplo n.º 10
0
#        for kw in lkw:
#            counter += 1
#            check_id = TDB.session.query(TDB.Spam_words).filter(TDB.Spam_words.word_id==kw).all()
#            if (len(check_id) == 0):
#                u = TDB.Spam_words(word_id = kw, id= counter)
#                TDB.session.add(u)
#                print kw
#            else:
#                print kw , 'already present'
#
#
#TDB.session.commit()

counter = 0
with open('/home/rishu/Courses/CS5525/project/makeupBrands.txt', 'rb') as f:
    for line in f:
        lkw = line.split()
        print lkw
        for kw in lkw:
            counter += 1
            check_id = TDB.session.query(TDB.Beauty_businesses).filter(
                TDB.Spam_words.word_id == kw).all()
            if (len(check_id) == 0):
                u = TDB.Spam_words(word_id=kw)
                TDB.session.add(u)
                print kw
            else:
                print kw, 'already present'

TDB.session.commit()
Exemplo n.º 11
0
""" Cleans the ./tweets.db file by removing duplicate tweets.
Duplicate tweets are rows that share the same twid (not the same content);
within each group the row with the lowest rowid is kept.
"""
import datetime, TweetDB

db = TweetDB.sqlconnect("./tweets.db")

# Keep only the lowest-rowid row of every twid group; delete the rest.
dedupe_sql = """
    DELETE
    FROM tweets
    WHERE   rowid NOT IN
            (
            SELECT  MIN(rowid)
            FROM    tweets
            GROUP BY twid
            ) 
    """
db.cursor().execute(dedupe_sql)

now = datetime.datetime.now()

# Count the rows whose `created` column matches today's "Mon DD YYYY" string.
count_sql = """
    SELECT count(rowid)
    FROM tweets
    WHERE created
        LIKE '{}'
    """.format(now.strftime("%b %d %Y"))
count_row = db.cursor().execute(count_sql).fetchone()
todaystweets = count_row[0]

report = "Run on {}. {} tweets added today."
print(report.format(now.strftime("%c"), todaystweets))

db.commit()
Exemplo n.º 12
0
# -*- coding: utf-8 -*-
"""
Created on Sun Dec  7 08:54:14 2014

@author: rishu
"""

import pygal
import TweetDB as TDB
from EpiPipesApi import *

pytweets = []
users = TDB.scan(TDB.session, TDB.User)

# Count how often each user id occurs in the scan.
user_count = Counter()
for usr in users:
    user_count[usr['id']] += 1


# Prepare the SVG plot

user_tweet = fuse(TDB.session, TDB.User, TDB.Tweet_mirror, 'id', 'user_id')
barplot = pygal.HorizontalBar(style=pygal.style.SolidColorStyle)

# One bar per user for the (at most) `topnum` most frequent ids.
# Fix: iterate over the ranking itself instead of indexing it with
# range(topnum); indexing raised IndexError when fewer than `topnum` ids
# existed, and the ranking was recomputed three times per bar.
topnum = 10
for top_id, frequency in user_count.most_common(topnum):
    barplot.add(top_id, [{'value': frequency, 'label': top_id}])
Exemplo n.º 13
0
                for user in sq:
                    if ((user['protected'] == False) and
                        ((user['lang'] == 'en') or (user['lang'] == 'en-gb') or
                         (user['lang'] == 'en-AU') or
                         (user['lang'] == 'en-GB') or
                         (user['lang'] == 'en-au'))):
                        vf = "n"
                        check_id = TDB.session.query(TDB.Users).filter(
                            TDB.Users.id == user['id']).all()
                        if (len(check_id) == 0):
                            if user["verified"]:
                                vf = "y"
                            u = TDB.Users(id=user['id'],
                                          name=user['name'],
                                          screen_name=user['screen_name'],
                                          location=user['location'],
                                          created_at=user['created_at'],
                                          language=user['lang'],
                                          verified=vf,
                                          url=str(user['url']))
                            TDB.session.add(u)
                        else:
                            print 'already present in users table'

                        check_id = TDB.session.query(
                            TDB.User_vs_company).filter(
                                TDB.User_vs_company.user_id == user['id']
                            ).filter(
                                TDB.User_vs_company.company_id == cid).all()
                        if (len(check_id) == 0):
                            uc = TDB.User_vs_company(
                                user_id=user['id'],
"""
Created on Tue Dec 16 22:14:55 2014

@author: rishu
"""

from datetime import datetime
from dateutil import parser
from EpiPipesApi import *
import TweetDB as TDB
ctr = 1

# Copy every row of tweet_time_tmp into Tweet_time, converting the stored
# timestamp string into a datetime.
# Fix: the loop body was indented inconsistently (some statements sat at
# column 0), which is an IndentationError; the body is re-indented here.
for rec in TDB.engine.execute('select * from tweet_time_tmp'):
    dt = parser.parse(rec.date_time_str)
    d = TDB.Tweet_time(tweet_id=rec.tweet_id, date=dt)
    # The per-row duplicate check (cardinality of Tweet_time rows with this
    # tweet_id) is disabled, so every row is added unconditionally.
    TDB.session.add(d)
    ctr = ctr + 1
    print(ctr)
    # Commit in batches so the session does not hold every pending row.
    if ctr % 10000 == 0:
        TDB.session.commit()
        print(ctr)

# Flush whatever is left after the last full batch.
TDB.session.commit()

#for each timeline
#order tweets by date
Exemplo n.º 15
0
    # Skip tweets that already have rows in Words_in_tweet.
    check_id = TDB.session.query(TDB.Words_in_tweet).filter(
        TDB.Words_in_tweet.tweet_id == u.tweet_id).all()
    if (len(check_id) == 0):
        # Look up the Tweet row(s) whose id equals u.tweet_id; only the first
        # result is used below (an empty result would fail at t[0]).
        t = scan(TDB.session, filterEQ(TDB.session, TDB.Tweet, 'id',
                                       u.tweet_id))
        #start1 = time.time()
        # processTweet returns five values; only `wl` and `hts` are used here
        # (presumably the word list and the hashtags -- confirm).
        [hts, ct, ur, wl, dp] = processTweet(t[0].text, regexp, urldelims,
                                             sentenceEnders)
        #print "PTime ", time.time() - start1
        #        hhts = []
        #        for h in hts:
        #            hhts.append('#' + h)
        # One Words_in_tweet row per entry of wl followed by hts.
        finalWordList = wl + hts
        for word in finalWordList:
            #if (len(check_id) == 0):
            wid = TDB.Words_in_tweet(tweet_id=t[0].id, word_id=word)
            TDB.session.add(wid)
    # Commit every 1000 iterations and print the seconds elapsed since the
    # previous timing reset.
    if (session_ctr % 1000 == 0):
        TDB.session.commit()
        print "Time ", time.time() - start
        start = time.time()
    session_ctr += 1

#
#tweet_keywords = fuse(TDB.session, TDB.Words_in_tweet, TDB.Keywords, 'word_id', 'ketword')
#tweet_keywords_count = aggregate(TDB.session, )
#kws = scan(TDB.session, TDB.Keywords)
#for kw in kws:

#tweets = scan(TDB.session, TDB.Tweet)
#print len(tweets)