示例#1
0
def pull_deleted(db, api, twitterapi, uid, nort=False):
    """Pass the ids of tweets flagged as deleted to ``add100`` in batches.

    Args:
        db: database handle; ``db.tweets`` is queried for documents with
            ``deleted`` set to True.
        api, twitterapi: forwarded untouched to ``add100``.
        uid: when truthy, only tweets whose ``user.id`` equals it are used.
        nort: when true, tweets carrying a ``retweeted_status`` key are
            left out of the batches.
    """
    query = {'deleted': True}
    if uid:
        query['user.id'] = uid
    cursor = db.tweets.find(query)

    # Show a progress bar only when running verbosely.
    if verbose():
        progress = Bar("Processing:",
                       max=cursor.count(),
                       suffix='%(index)d/%(max)d - %(eta_td)s')
        cursor = progress.iter(cursor)

    batch = []
    for tweet in cursor:
        tweet_id = tweet['id']
        is_skipped_retweet = nort and 'retweeted_status' in tweet
        if not is_skipped_retweet:
            batch.append(tweet_id)
            # Flush as soon as a batch of 100 ids has been collected.
            if len(batch) == 100:
                add100(db, api, twitterapi, batch)
                batch = []

    # Flush the final, partially filled batch.
    if batch:
        add100(db, api, twitterapi, batch)
示例#2
0
def pull_favorited(db, api, twitterapi):
    """Pass ids of favorited tweets that are not stored yet to ``add100``.

    Iterates over ``db.favorites`` documents whose ``pulled`` field is
    None. A favorite whose tweet already exists in ``db.tweets`` is marked
    ``pulled: True``; every other tweet id is collected and handed to
    ``add100`` in batches of 100.

    Args:
        db: database handle providing the ``favorites`` and ``tweets``
            collections.
        api, twitterapi: forwarded untouched to ``add100``.
    """
    favs = db.favorites.find({'pulled': None}).batch_size(100)
    idlist = []
    if verbose():
        favs = Bar("Processing:",
                   max=favs.count(),
                   suffix='%(index)d/%(max)d - %(eta_td)s').iter(favs)
    for f in favs:
        twid = f['tweet_id']
        if db.tweets.find_one({'id': twid}) is not None:
            # Tweet is already stored: just flag the favorite as handled.
            db.favorites.update(f, {'$set': {'pulled': True}})
            continue
        idlist.append(twid)
        if verbose():
            # Single-argument call: prints the same text ("  <id>") under
            # both the Python 2 print statement and the Python 3 function.
            print("  {}".format(twid))
        if len(idlist) == 100:
            add100(db, api, twitterapi, idlist)
            idlist = []
    # Flush the final, partially filled batch.
    if idlist:
        add100(db, api, twitterapi, idlist)
示例#3
0
def pull_quoted(db, api, twitterapi):
    """Resolve quoted tweets for stored tweets not yet marked as handled.

    Selects tweets with a positive ``quoted_status_id`` and no
    ``quote_pulled`` flag. When the quoted tweet is already stored it is
    embedded as ``quoted_status`` (if missing) and the quoting tweet is
    flagged ``quote_pulled: True``. Otherwise the quoted id is collected
    and handed to ``add100`` in batches of 100.

    Args:
        db: database handle providing the ``tweets`` collection.
        api, twitterapi: forwarded untouched to ``add100``.
    """
    tweets = db.tweets.find(
        {
            'quoted_status_id': {
                '$gt': 0
            },
            'quote_pulled': None
        }, {
            'quoted_status_id': 1,
            'quoted_status': 1,
            'id': 1
        })
    if verbose():
        tweets = Bar("Processing:",
                     max=tweets.count(),
                     suffix='%(index)d/%(max)d - %(eta_td)s').iter(tweets)
    idlist = []
    for t in tweets:
        twid = t['quoted_status_id']
        if twid is None:
            # The query requires quoted_status_id > 0, so this branch is a
            # defensive cleanup only.
            # update_one replaces the deprecated Collection.update; both
            # modify a single matching document here.
            db.tweets.update_one(t, {'$unset': {'quoted_status_id': 1}})
            print("point 1: this should never? be reached, i think")
            continue
        #if get_tracked(db, uid=t['user']['id']) is None or not is_greek(db, uid=t['user']['id']): continue
        orig = db.tweets.find_one({'id': twid})
        if orig:
            if 'quoted_status' not in t:
                # Drop the stored document's own _id before embedding it.
                del orig['_id']
                db.tweets.update_one(t, {'$set': {'quoted_status': orig}})
                if verbose():
                    print(u"filled in tweet {} into {}".format(twid, t['id']))
            db.tweets.update_one(t, {'$set': {'quote_pulled': True}})
            continue
        # Several tweets may quote the same original: queue each id once
        # per batch.
        if twid not in idlist:
            idlist.append(twid)
        if verbose(): print(" ", twid)
        if len(idlist) >= 100:
            add100(db, api, twitterapi, idlist)
            idlist = []
    # Flush the final, partially filled batch.
    if idlist:
        add100(db, api, twitterapi, idlist)
示例#4
0
    # Range condition later applied to `created_at`; each bound is optional.
    criteria = {}
    if options.before:
        criteria['$lte'] = dateutil.parser.parse(options.before)
    if options.after:
        criteria['$gt'] = dateutil.parser.parse(options.after)

    #edges = db.favorites.find({}, {'user_id':1, 'tweet_id':1}).sort('user_id', 1).batch_size(10)
    if options.before or options.after:
        # Date-restricted mode: load id and user.id of every tweet in the
        # requested range, scan them and write a weighted edge list.
        tweets = db.tweets.find({'created_at': criteria}, {
            'id': 1,
            'user.id': 1
        })
        # NOTE(review): `verbose` is tested here without being called. If it
        # is the verbose() accessor function, this condition is always true
        # and the progress bar is shown unconditionally -- confirm against
        # the part of the file not visible here.
        if verbose:
            tweets = Bar("Loading:",
                         max=tweets.count(),
                         suffix='%(index)d/%(max)d - %(eta_td)s').iter(tweets)
        # Materialize the cursor before handing it to scan_by_tweets.
        tweets = list(tweets)
        edgecnt = scan_by_tweets(db, tweets)
        save_edgelist(db, edgecnt, options.filename, weight=True)
    elif options.user:
        # Single-user mode: options.user is a numeric id when options.ids is
        # set, otherwise a name resolved through lookup_user (-1 is used when
        # the returned record has no 'id').
        uid = int(options.user) if options.ids else lookup_user(
            db, uname=options.user).get('id', -1)
        tweets = db.tweets.find({'user.id': uid}, {'id': 1, 'user.id': 1})
        tweets = list(tweets)
        edgecnt = scan_by_tweets(db, tweets)
        # Output format is selected by options.dot.
        if options.dot:
            save_dot(db, edgecnt, options.filename, weight=True)
        else:
            save_edgelist(db, edgecnt, options.filename, weight=True)
    else:
示例#5
0
  auth = tweepy.OAuthHandler(config.consumer_key, config.consumer_secret)
  auth.set_access_token(config.access_token, config.access_token_secret)
  api = tweepy.API(auth)

  for user in args:
    uid = long(user) if options.ids else None
    uname = None if options.ids else user
    u = lookup_user(db, uid, uname)
    if u is None:
      print uid, uname, "not found"
    if options.scan:
      tweets = db.tweets.find({'user.id': u['id'], 'deleted': None}).sort('created_at', 1)
      idlist = []
      for t in tweets:
        idlist.append(t['id'])
        if len(idlist) == 100:
          idlist = add100(db, api, twitterapi, idlist)
          print u'found {} deleted'.format(len(idlist))
          idlist = []
      idlist = add100(db, api, twitterapi, idlist)
      print u'found {} deleted'.format(len(idlist))
      idlist = []

    tweets = db.tweets.find({'deleted': True, 'user.id': u['id']}).sort('created_at', 1)
    if verbose():
      tweets = Bar("Processing:", max=tweets.count(), suffix = '%(index)d/%(max)d - %(eta_td)s').iter(tweets)
    for t in tweets:
      if options.nort and 'retweeted_status' in t: continue
      print u'{} {} {}: {}'.format(t.get('id', '-'), t.get('created_at', None), u['screen_name_lower'], t.get('text', '<not found>')).encode('utf-8')

示例#6
0
if __name__ == '__main__':
    # Command line: a single -v/--verbose switch.
    parser = optparse.OptionParser()
    parser.add_option('-v',
                      '--verbose',
                      action='store_true',
                      dest='verbose',
                      default=False,
                      help='List names of tracked users')
    (options, args) = parser.parse_args()
    verbose(options.verbose)
    db, api = init_state()
    users = db.following.find().batch_size(100)
    if verbose():
        users = Bar("Processing:",
                    max=users.count(),
                    suffix='%(index)d/%(max)d - %(eta_td)s').iter(users)
    for u in users:
        uid = u['id']
        us = lookup_user(db, uid)
        # NOTE(review): find_one returns None when no crawlerdata document
        # exists for this id, in which case cdata.get below raises
        # AttributeError -- confirm every followed user has such a document.
        cdata = db.crawlerdata.find_one({'id': uid})
        # Today's UTC date at midnight.
        d = datetime.utcnow().date()
        d = datetime(d.year, d.month, d.day)
        #if us.get('deleted', False):
        #print "User marked deleted. Skip."
        #continue
        # Skip users whose profile was downloaded within the last 30 days.
        # A missing date defaults to the epoch. Plain decimal literals are
        # used because leading-zero forms such as `01` are a syntax error on
        # Python 3; the value is unchanged.
        if cdata.get('downloaded_profile_date',
                     datetime(1970, 1, 1)) > (d - timedelta(days=30)):
            #if verbose(): print "Picture already downloaded. Skip."
            continue
示例#7
0
    # Tweet branch: resolve every URL attached to each tweet in `cursor`.
    # NOTE(review): the progress bar maximum is the size of the whole tweets
    # collection, not of `cursor`, so it may overstate the work if `cursor`
    # is filtered -- confirm against the query defined above this fragment.
    if verbose():
      cursor = Bar('Loading:', max=db.tweets.count(), suffix = '%(index)d/%(max)d - %(eta_td)s').iter(cursor)
    for t in cursor:
      out_urls = []
      if 'urls' not in t or t['urls'] is None: continue
      for url in t['urls']:
        try:
          out_url = deshorten_url(db, url)
        except pymongo.errors.WriteError:
          # A failed write for one URL does not abort the tweet.
          continue
        if out_url:
          out_urls.append(out_url)
      # Mark the tweet as processed. The collected out_urls are currently
      # not written back (the update below is commented out).
      db.tweets.update_one({'id': t['id']}, {'$set': {'deshorten': True}})
      #if len(out_urls):
        #db.tweets.update_one({'id': t['id']}, {'$set': {'urls': out_urls}})
  else:
    # User branch: resolve the profile URL of one user (options.user) or of
    # every user that has a non-null url.
    if options.user:
      cursor = db.users.find({'id': u['id'], 'url': {'$ne': None}}).batch_size(10)
    else:
      cursor = db.users.find({'url': {'$ne': None}}).batch_size(10)
    cnt = cursor.count()
    if verbose():
      cursor = Bar('Loading:', max=cnt, suffix = '%(index)d/%(max)d - %(eta_td)s').iter(cursor)
    print u'Found {}'.format(cnt)
    # Note: the loop variable rebinds `u`, which was read in the query above.
    for u in cursor:
      url = u['url']
      # The result is not stored on the user (the update below is commented
      # out); deshorten_url is called only for whatever it does itself.
      out_url = deshorten_url(db, url)
      #if out_url:
        #db.users.update_one(u, {'$set': {'url': out_url}})

示例#8
0
 # Local aliases for the lookup tables held in twkit.utils.cache.
 # NOTE(review): the contents of these tables are not visible here; the
 # code below only relies on membership tests and on names.get(user_id).
 names = twkit.utils.cache['names']
 ids = twkit.utils.cache['ids']
 ignored = twkit.utils.cache['ign']
 dead = twkit.utils.cache['dead']
 suspended = twkit.utils.cache['susp']
 protected = twkit.utils.cache['prot']
 greek = twkit.utils.cache['gr']
 seen = Counter()
 unseen = Counter()
 # Retweets whose original tweet is in the configured language; only the
 # retweeting user and the original tweet are fetched.
 cursor = db.tweets.find({'retweeted_status.lang': config.lang}, {
     'user': 1,
     'retweeted_status': 1
 })
 if verbose():
     cursor = Bar("Adding:",
                  max=cursor.count(),
                  suffix='%(index)d/%(max)d - %(eta_td)s ').iter(cursor)
 for tweet in cursor:
     whoid = tweet["user"]["id"]
     # Skip retweeters listed in any of the exclusion tables.
     if whoid in dead: continue
     if whoid in ignored: continue
     if whoid in protected: continue
     if whoid in suspended: continue
     # Cache first, database second; users found in neither are tallied in
     # `unseen` and skipped.
     u = names.get(whoid, None)
     if u is None:
         u = lookup_user(db, uid=whoid)
         if u is None:
             unseen[whoid] += 1
             continue
     # With options.user set, keep only tweets by that user; `user` is
     # defined outside this fragment.
     if options.user:
         if user != u['screen_name_lower']: continue
示例#9
0
                      action="store",
                      type="int",
                      dest="stopafter",
                      default=None,
                      help="Scan the given number of users")
    options, args = parser.parse_args()
    verbose(options.verbose)

    db, api = init_state()
    # Work from the collection selected on the command line.
    userlist = db.suspended.find() if options.suspended else db.protected.find()

    # Count of processed users; only maintained when a limit was requested.
    if options.stopafter:
        current = 0

    if verbose():
        progress = Bar("Loading:",
                       max=userlist.count(),
                       suffix='%(index)d/%(max)d - %(eta_td)s')
        userlist = progress.iter(userlist)

    for user in userlist:
        uid = long(user['id'])
        # Skip users for whom the state check matching the selected
        # collection returns true.
        if options.suspended:
            if is_suspended(db, uid):
                continue
        elif is_protected(db, uid):
            continue
        follow_user(db, api, uid=uid, wait=True, refollow=True)
        if not options.stopafter:
            continue
        # Stop once the requested number of users has been followed.
        current += 1
        if current == options.stopafter:
            break
示例#10
0
                      dest='users',
                      default=False,
                      help='Also output user id.')
    (options, args) = parser.parse_args()
    db, _ = init_state(use_cache=False, ignore_api=True)
    verbose(options.verbose)

    # Optional range filter on event_start. The defaultdict creates the
    # nested condition dict on first use, so the filter stays empty when no
    # bound is given.
    criteria = defaultdict(dict)

    if options.after:
        criteria['event_start'].update(
            {'$gte': dateutil.parser.parse(options.after)})
    if options.before:
        criteria['event_start'].update(
            {'$lte': dateutil.parser.parse(options.before)})

    botsfound = db.botsperweek.find(dict(criteria))
    if verbose():
        botsfound = Bar(
            "Loading:",
            max=botsfound.count(),
            suffix='%(index)d/%(max)d - %(eta_td)s').iter(botsfound)
    # Print the `source` field (optionally preceded by the author id) of
    # every tweet referenced by the selected botsperweek documents.
    for v in botsfound:
        for tid in v['tweet_ids']:
            tw = db.tweets.find_one({'id': tid})
            if tw is None:
                # find_one returns None when the referenced tweet is not
                # stored; skip it instead of failing with a TypeError.
                continue
            if options.users:
                print(u'{} {}'.format(tw['user']['id'],
                                      tw['source']).encode('utf-8'))
            else:
                print(u'{}'.format(tw['source']).encode('utf-8'))
示例#11
0
# Command line: -v for progress output, plus two switches selecting which
# set of users is examined.
parser = optparse.OptionParser()
parser.add_option('-v', '--verbose', action='store_true', dest='verbose', default=False, help='List names of tracked users')
parser.add_option('--vectorized', action='store_true', dest='vectorized', default=False, help='List only vectorized users.')
parser.add_option('--greek', action='store_true', dest='greek', default=False, help='List only greek users.')
(options, args) = parser.parse_args()

db,api = init_state(use_cache=False)

# Parallel lists, one entry appended to each per examined user.
twittercounts = []
crawlercounts = []

if options.vectorized:
  # Read both counts directly from the uservectors documents.
  vectors = db.uservectors.find({}, {'tweet_count': 1, 'seen_total': 1})
  # This script reads options.verbose directly; it never calls verbose().
  if options.verbose:
    vectors = Bar("Processing:", max=vectors.count(), suffix = '%(index)d/%(max)d - %(eta_td)s').iter(vectors)
  for v in vectors:
    twittercounts.append(v['tweet_count'])
    crawlercounts.append(v['seen_total'])
elif options.greek:
  greeks = db.greeks.find().batch_size(1)
  if options.verbose:
    greeks = Bar("Processing:", max=greeks.count(), suffix = '%(index)d/%(max)d - %(eta_td)s').iter(greeks)
  for g in greeks:
    cursor = db.tweets.aggregate([
      { '$match': { 'user.id' : g['id'] } },
      { '$group':
        { '_id': '$user.id',
          'count': {'$sum': 1}
        }
      }],
示例#12
0
#!/usr/bin/python3
# -*- coding: utf-8 -*-
###########################################
# (c) 2016-2020 Polyvios Pratikakis
# [email protected]
###########################################

from collections import Counter
from progress.bar import Bar
from twkit.utils import *

if __name__ == '__main__':
  # Count how many documents in db.users share each user id and print
  # "<id> : <count>" for every id in ascending order.
  parser = optparse.OptionParser()
  parser.add_option("-v", "--verbose", action="store_true", dest="verbose", default=False, help="make noise.")
  (options, args) = parser.parse_args()

  verbose(options.verbose)
  db, api = init_state(True)

  users = db.users.find({}, {'id':1})
  # Show the progress bar only on request; previously -v was parsed and
  # stored but had no effect because the bar was always shown.
  if verbose():
    users = Bar("Processing:", max=users.count(), suffix = '%(index)d/%(max)d - %(eta_td)s').iter(users)
  # Counter tallies the ids directly from the cursor.
  counter = Counter(u['id'] for u in users)

  for c in sorted(counter):
    print(u'{} : {}'.format(c, counter[c]))