def check():
    """Scan the latest @realdonaldtrump tweets for the phrase 'fake news',
    donate once per new matching tweet, and record each donation.

    Stops at the first matching tweet already recorded in the datastore
    (anything older was handled by a previous run).

    :return: total amount donated during this run.
    """
    api = twitter.Api(consumer_key=ta['consumer_key'],
                      consumer_secret=ta['consumer_secret'],
                      access_token_key=ta['access_token'],
                      access_token_secret=ta['access_token_secret'])
    statuses = api.GetUserTimeline(screen_name='realdonaldtrump',
                                   trim_user=True, count=5)
    total = 0
    for s in statuses:
        logging.debug("Reading tweet {}: {}".format(s.id, s.text))
        if 'fake news' in s.text.lower():
            # Guard clause instead of `== False`: once we hit a tweet that is
            # already in the DB, every older tweet has been processed too.
            if is_in_db(s.id):
                break
            donation = calculate_donation(s.text)
            total += donation
            donation_success = propublica.donate(donation, s.id)
            record = models.Tweet(
                text=s.text,
                tid=s.id,
                donation=donation,
                donation_successful=donation_success)
            record.put()
    return total
def SaveToDataStore(self, campaignId, username, tweetText):
    """Persist one tweet record (text, author, campaign) to the datastore."""
    logging.info('Save to dataStore')
    entity = models.Tweet()
    entity.text = tweetText
    entity.userName = username
    entity.campaignId = campaignId
    entity.put()
def add_tweet():
    """
    Adds tweet to the application, expects "content" and "username" from the
    request body.

    :return: A dictionary of the form
        {'id': the id number of the newly inserted tweet}
    """
    payload = request.json
    new_tweet = models.Tweet(
        text_content=payload['content'],
        username=payload['username'],
        timestamp=datetime.datetime.now(),
    )
    db.session.add(new_tweet)
    db.session.commit()
    return {'id': new_tweet.id}
def _insert(self, service, created_at, message, status, tweet_id):
    """Insert a Tweet entity keyed '<service>-<created_at>' unless an entity
    with that key already exists (best-effort de-duplication)."""
    key = '{}-{}'.format(service, created_at)
    # NOTE(review): GAE query filters are normally written 'prop =' (with a
    # space), and key names are usually looked up via get_by_key_name()
    # rather than filter() — confirm 'key_name=' actually matches anything.
    tw = models.Tweet.all().filter('key_name=', key).fetch(1)
    if not tw:
        tw = models.Tweet(
            service=service,
            created_at=created_at,
            message=message,
            status=status,
            key_name=key,
            tweet_id=tweet_id,
        )
        # Persist only the newly created entity; when the query returned a
        # result, `tw` is a list and nothing is written.
        tw.put()
def pw2dj(tables=((User, models.User), (Place, models.Place), (Tweet, models.Tweet)),
          delete_first=True, batch_size=10000):
    """Copies all records from peewee sqlite database to Django postgresql database, ignoring ForeignKeys

    This worked and also migrated foreign keys! (only 217 in_reply_to tweets out of 240k though)
    """
    # NOTE(review): uses dict.iteritems(), so this is Python 2 code.
    for from_cls, to_cls in tables:
        print('=' * 100)
        print('Copying {} -> {}'.format(from_cls, to_cls))
        if delete_first:
            # Wipe the Django-side table before copying.
            M = to_cls.objects.count()
            print('Deleting {} {} records'.format(M, to_cls))
            to_cls.objects.all().delete()
            assert (to_cls.objects.count() == 0)
        query = from_cls.select()
        N = query.count()
        records = []
        for i, obj in enumerate(query):
            d = model_to_dict(obj)
            if isinstance(obj, models.Tweet):
                if d['in_reply_to'] is not None and len(d['in_reply_to']) > 0:
                    # NOTE(review): this assigns to the *class* attribute
                    # `to_cls.in_reply_to`, not to the current record —
                    # looks like a bug; confirm intent.
                    to_cls.in_reply_to = models.Tweet(**d['in_reply_to'])
            for k, v in d.iteritems():
                # only works for foreign keys to self
                if isinstance(v, dict) and not len(v):
                    d[k] = None
                else:
                    # FIXME: come back later and fill in foreign keys: in_reply_to, place, user
                    # NOTE(review): as written, this else branch also nulls
                    # every non-empty value — verify the intended nesting.
                    d[k] = None
            records += [to_cls(**d)]
            if not i % batch_size:
                # Sanity check: the source table must not change mid-copy.
                assert (from_cls.select().count() == N)
                print('Saving {:08d}/{:08d} {}: {}'.format(
                    i, N, round(i * 100. / N, 1), obj))
                # this will not work for many2many fields
                to_cls.objects.bulk_create(records)
                records = []
        if len(records):
            print('Saving last batch {:08d}/{:08d} {}: {}'.format(
                i, N, round(i * 100. / N, 1), obj))
            # this will not work for many2many fields
            to_cls.objects.bulk_create(records)
            records = []
def saveTweets(db_session, tweets, terms):
    """Function that stores tweets in database.

    :param db_session: SQLAlchemy session used for merge/commit.
    :param tweets: iterable of dicts with keys 'date', 'id', 'language',
        'profile', 'author', 'url', 'text'.
    :param terms: two-element sequence (country, search word) attached to
        every stored tweet.
    :return: True on completion.
    """
    for row_dict in tweets:
        string_date = row_dict.get('date')
        # 'Wed Aug 27 13:08:45 +0000 2008' -> drop the '+0000' offset so the
        # remaining text matches '%a %b %d %H:%M:%S %Y'.
        tweet_date = datetime.strptime(string_date[:19] + string_date[-5:],
                                       '%a %b %d %H:%M:%S %Y')
        # Fix: build a fresh Tweet per row — the original reused one shared
        # instance across all iterations, mutating it between merges.
        tweet = models.Tweet()
        tweet.tweet_id = row_dict.get('id')
        tweet.tweet_language = row_dict.get('language')
        tweet.tweet_profile = row_dict.get('profile')
        tweet.tweet_author = row_dict.get('author')
        tweet.tweet_url = row_dict.get('url')
        tweet.tweet_text = row_dict.get('text')
        tweet.tweet_country = terms[0]
        tweet.tweet_search_word = terms[1]
        tweet.tweet_date = tweet_date
        db_session.merge(tweet)
    # Single commit for the whole batch.
    db_session.commit()
    return True
def insert_tweets(tweets):
    """Persist a batch of tweets (with geo point and hashtag associations)."""
    for tweet in tweets:
        lat, long = extract_point(tweet.geo)
        t = models.Tweet(tweet_id=tweet.id,
                         text=tweet.text,
                         created_at=tweet.created_at,
                         geo_lat=lat,
                         geo_long=long,
                         user_id=tweet.user.id,
                         screen_name=tweet.user.screen_name,
                         retweet_count=tweet.retweet_count,
                         favorite_count=tweet.favorite_count)
        for hashtag in tweet.hashtags:
            # NOTE(review): a brand-new Hashtag row is created per occurrence;
            # existing hashtags are never looked up, so duplicates accumulate.
            association = models.TweetHashtagAssociation()
            association.hashtag = models.Hashtag(hashtag_text=hashtag.text)
            t.hashtags.append(association)
        session.add(t)
    # NOTE(review): one commit for the whole batch — confirm the original
    # did not commit per tweet (source formatting is ambiguous here).
    session.commit()
def getTweets(worker, receiveBuffer=None, bufferLength=500, proxyURL=None):
    """Scrape tweets matching the worker's search criteria from Twitter's
    HTML search results, honoring the worker's export options.

    Collected tweets are flushed to `receiveBuffer` in chunks of
    `bufferLength`; scraping stops when the worker is cancelled, the
    connection fails, or no further results are returned.
    """
    cursorHash = ''
    results = []
    resultsAux = []
    cookie = ''
    active = True
    formSearchCriteria = worker.formSearchCriteria
    formExportOptions = worker.formExportOptions
    formProxyOptions = worker.formProxyOptions
    url = SearchController.constructURL(formSearchCriteria)
    randomUserAgent = models.UserAgents.getRandomUserAgent()
    while active and worker.running():
        searchResults = SearchController.getTweetSearchResults(
            url, cursorHash, cookie, proxyURL, randomUserAgent)
        if (searchResults == None):
            worker.sgnOutput.emit('Connection Error!')
            break
        if (searchResults and len(searchResults['items_html'].strip()) == 0):
            break
        # Pagination cursor for the next request.
        cursorHash = searchResults['min_position']
        scrapedTweets = PyQuery(searchResults['items_html']).remove('div.withheld-tweet')
        tweets = scrapedTweets('div.js-stream-tweet')
        if len(tweets) == 0:
            break
        for tweetHTML in tweets:
            if worker.running() == False:
                break
            tweetPQ = PyQuery(tweetHTML)
            tweet = models.Tweet()
            tweetPermalink = tweetPQ.attr("data-permalink-path")
            # Each field is extracted only when the export options request it.
            if(formExportOptions.dateTime):
                tweet.setDateTime(datetime.datetime.fromtimestamp(int(
                    tweetPQ("small.time span.js-short-timestamp").attr("data-time"))).strftime("%Y-%m-%d %H:%M:%S"))
            if(formExportOptions.id):
                tweet.setId(tweetPQ.attr("data-tweet-id"))
            if(formExportOptions.permalink):
                tweet.setPermalink('https://twitter.com' + tweetPermalink)
            if(formExportOptions.posterUsername):
                # Username is the first path segment of the permalink.
                tweet.setPosterUsername('@' + re.split('/', tweetPermalink)[1])
            if(formExportOptions.posterProfileName or formExportOptions.posterNumberOfFollowers):
                # Extra request per tweet to fetch profile name / follower count.
                rData = SearchController.getFollowersCountAndProfileName(
                    tweet.posterUsername, cookie, proxyURL, randomUserAgent)
                if(formExportOptions.posterProfileName):
                    tweet.setPosterProfileName(rData[1])
                if(formExportOptions.posterNumberOfFollowers):
                    tweet.setPosterNumberOfFollowers(rData[0])
            # tweet.language = tweetPQ("p.js-tweet-text").attr("lang")
            tweetTextHtml = tweetPQ("p.js-tweet-text").outerHtml()
            # Replace emoji <img> tags with their alt text, then collapse
            # whitespace in the resulting plain text.
            tweetTextHtmlWithEmojis = re.sub(r"<img.*?alt=\"(.*?)\"[^\>]+>", r'\1', tweetTextHtml)
            tweetTextHtmlWithEmojis = re.sub(
                r"\s+", " ", tweetPQ(tweetTextHtmlWithEmojis).text())
            tweet.setText(tweetTextHtmlWithEmojis)
            if(formExportOptions.numberOfRetweets):
                tweet.setNumberOfRetweets(int(tweetPQ("span.ProfileTweet-action--retweet span.ProfileTweet-actionCount").attr(
                    "data-tweet-stat-count").replace(",", "")))
            if(formExportOptions.isARetweetStatus):
                # NOTE(review): presence of div.QuoteTweet flags a *quote*
                # tweet; labelling it 'Retweet' may be a misnomer — confirm.
                tweet.setIsARetweetStatus('Retweet' if len(tweetPQ("div.QuoteTweet")) > 0 else 'Tweet')
            results.append(tweet)
            resultsAux.append(tweet)
            if receiveBuffer and len(resultsAux) >= bufferLength:
                receiveBuffer(resultsAux)
                resultsAux = []
        # Rotate the user agent between result pages.
        randomUserAgent = models.UserAgents.getRandomUserAgent()
    # Flush any remaining buffered tweets.
    if receiveBuffer and len(resultsAux) > 0:
        receiveBuffer(resultsAux)
# NOTE(review): the next two statements use a bare `return` and undefined
# names (`url`), so they appear to be the tail of a truncated helper —
# probably `def parse(url):` — lost during extraction. Confirm against the
# original file.
r = urlfetch.fetch(url)
if r.status_code == 200:
    return minidom.parseString(r.content)

# Script body (Python 2 / Google App Engine): fetch the configured Twitter
# user's timeline XML and store any tweets not already in the datastore.
newtweets = 0
sps = models.SiteProperties.all().fetch(1)
for sp in sps:
    twitteruser = sp.twitteruser
    dom = parse('http://twitter.com/statuses/user_timeline.xml?screen_name=' + twitteruser)
    for tw in dom.getElementsByTagName('status'):
        ttext = convertlinks(tw.getElementsByTagName('text')[0].firstChild.data)
        tid = tw.getElementsByTagName('id')[0].firstChild.data
        tdate = tw.getElementsByTagName('created_at')[0].firstChild.data
        # Only insert tweets whose id is not already stored.
        if (db.GqlQuery("SELECT * FROM Tweet WHERE id = :twid", twid=tid).get() == None):
            newtweet = models.Tweet()
            newtweet.id = tid
            # Strip the ' +0000' offset before parsing the timestamp.
            newtweet.date = datetime.datetime.strptime(
                string.replace(tdate, ' +0000', ''), '%a %b %d %H:%M:%S %Y')
            newtweet.content = ttext
            newtweet.put()
            newtweets += 1
if newtweets > 0:
    print '<html>Added ' + str(newtweets) + ' New Tweets</html>'
else:
    print '<html>No New Tweets</html>'
def getTweets(tweetCriteria, receiveBuffer=None, bufferLength=100, proxy=None):
    """Scrape tweets matching `tweetCriteria` from Twitter's legacy search
    endpoint, paging via the `min_position` cursor.

    Tweets are flushed to `receiveBuffer` in chunks of `bufferLength`;
    scraping stops when no more results arrive or `maxTweets` is reached.
    Returns the full list of collected tweets.
    """
    refreshCursor = ''
    results = []
    resultsAux = []
    cookieJar = http.cookiejar.CookieJar()
    # Strip surrounding quotes from a quoted username, if present.
    if hasattr(tweetCriteria, 'username') and (
            tweetCriteria.username.startswith("\'") or tweetCriteria.username.startswith("\"")) and (
            tweetCriteria.username.endswith("\'") or tweetCriteria.username.endswith("\"")):
        tweetCriteria.username = tweetCriteria.username[1:-1]
    active = True
    while active:
        json = TweetManager.getJsonReponse(tweetCriteria, refreshCursor, cookieJar, proxy)
        if len(json['items_html'].strip()) == 0:
            break
        refreshCursor = json['min_position']
        tweets = PyQuery(json['items_html'])('div.js-stream-tweet')
        if len(tweets) == 0:
            break
        for tweetHTML in tweets:
            tweetPQ = PyQuery(tweetHTML)
            tweet = models.Tweet()
            usernameTweet = tweetPQ("span:first.username.u-dir b").text()
            # Collapse whitespace and undo Twitter's '# '/'@ ' splitting.
            txt = re.sub(
                r"\s+", " ",
                tweetPQ("p.js-tweet-text").text().replace('# ', '#').replace(
                    '@ ', '@'))
            retweets = int(
                tweetPQ(
                    "span.ProfileTweet-action--retweet span.ProfileTweet-actionCount"
                ).attr("data-tweet-stat-count").replace(",", ""))
            favorites = int(
                tweetPQ(
                    "span.ProfileTweet-action--favorite span.ProfileTweet-actionCount"
                ).attr("data-tweet-stat-count").replace(",", ""))
            dateSec = int(
                tweetPQ("small.time span.js-short-timestamp").attr(
                    "data-time"))
            # NOTE: shadows the builtin id() inside this loop.
            id = tweetPQ.attr("data-tweet-id")
            permalink = tweetPQ.attr("data-permalink-path")
            geo = ''
            geoSpan = tweetPQ('span.Tweet-geo')
            if len(geoSpan) > 0:
                geo = geoSpan.attr('title')
            tweet.id = id
            tweet.permalink = 'https://twitter.com' + permalink
            tweet.username = usernameTweet
            tweet.text = txt
            tweet.date = datetime.datetime.fromtimestamp(dateSec)
            tweet.retweets = retweets
            tweet.favorites = favorites
            tweet.mentions = " ".join(
                re.compile('(@\\w*)').findall(tweet.text))
            tweet.hashtags = " ".join(
                re.compile('(#\\w*)').findall(tweet.text))
            tweet.geo = geo
            results.append(tweet)
            resultsAux.append(tweet)
            if receiveBuffer and len(resultsAux) >= bufferLength:
                receiveBuffer(resultsAux)
                resultsAux = []
            if tweetCriteria.maxTweets > 0 and len(
                    results) >= tweetCriteria.maxTweets:
                active = False
                break
    # Flush any remaining buffered tweets before returning.
    if receiveBuffer and len(resultsAux) > 0:
        receiveBuffer(resultsAux)
    return results
def getTweets(tweetCriteria, receiveBuffer=None, bufferLength=100, proxy=None, debug=True, proxies=None):
    """Get tweets that match the tweetCriteria parameter
    A static method.

    Parameters
    ----------
    tweetCriteria : tweetCriteria, an object that specifies a match criteria
    receiveBuffer : callable, a function that will be called upon a getting next `bufferLength' tweets
    bufferLength: int, the number of tweets to pass to `receiveBuffer' function
    proxy: str, a proxy server to use
    debug: bool, output debug information
    proxies: list of proxy urls
    """
    results = []
    resultsAux = []
    cookieJar = http.cookiejar.CookieJar()
    user_agent = random.choice(TweetManager.user_agents)
    # NOTE(review): unconditionally overrides the `debug` parameter.
    debug = True
    # proxies = getProxies()
    all_usernames = []
    usernames_per_batch = 20
    # fp = FreeProxy()
    if hasattr(tweetCriteria, 'username'):
        # Normalize to a deduplicated, lowercased, '@'-stripped list.
        if type(tweetCriteria.username) == str or not hasattr(tweetCriteria.username, '__iter__'):
            tweetCriteria.username = [tweetCriteria.username]
        usernames_ = [u.lstrip('@') for u in tweetCriteria.username if u]
        all_usernames = sorted({u.lower() for u in usernames_ if u})
        n_usernames = len(all_usernames)
        n_batches = n_usernames // usernames_per_batch + (n_usernames % usernames_per_batch > 0)
    else:
        n_batches = 1
    batch: int
    for batch in range(n_batches):
        # process all_usernames by batches
        refreshCursor = ''
        batch_cnt_results = 0
        # NOTE(review): `fp` is only defined in the commented-out
        # `FreeProxy()` line above, so this raises NameError at runtime.
        proxies = fp.get_proxies()
        if all_usernames:
            # a username in the criteria?
            tweetCriteria.username = all_usernames[
                batch * usernames_per_batch:batch * usernames_per_batch + usernames_per_batch]
        active = True
        print('using %s for batch %d' % (proxy, batch))
        while active:
            try:
                proxy = random.choice(proxies)
                json = TweetManager.getJsonResponse(tweetCriteria, refreshCursor, cookieJar, proxy, user_agent,
                                                    debug=debug)
            except Exception as e:
                print(e)
                print('json could not be downloaded ')
                # break
                # NOTE(review): continuing after a failed download leaves
                # `json` possibly unbound or stale for the code below.
                pass
            if len(json['items_html'].strip()) == 0:
                # break
                # NOTE(review): the original `break` was replaced by `pass`,
                # so an empty page no longer ends the loop (possible spin).
                pass
            refreshCursor = json['min_position']
            scrapedTweets = PyQuery(json['items_html'])
            # Remove incomplete tweets withheld by Twitter Guidelines
            scrapedTweets.remove('div.withheld-tweet')
            tweets = scrapedTweets('div.js-stream-tweet')
            if len(tweets) == 0:
                # break
                pass
            for tweetHTML in tweets:
                tweetPQ = PyQuery(tweetHTML)
                tweet = models.Tweet()
                usernames = tweetPQ("span.username.u-dir b").text().split()
                if not len(usernames):  # fix for issue #13
                    continue
                tweet.username = usernames[0]
                tweet.to = usernames[1] if len(usernames) >= 2 else None  # take the first recipient if many
                tweet.text = re.sub(r"\s+", " ", tweetPQ("p.js-tweet-text").text()) \
                    .replace('# ', '#').replace('@ ', '@').replace('$ ', '$')
                tweet.retweets = int(
                    tweetPQ("span.ProfileTweet-action--retweet span.ProfileTweet-actionCount").attr(
                        "data-tweet-stat-count").replace(",", ""))
                tweet.favorites = int(
                    tweetPQ("span.ProfileTweet-action--favorite span.ProfileTweet-actionCount").attr(
                        "data-tweet-stat-count").replace(",", ""))
                tweet.replies = int(tweetPQ("span.ProfileTweet-action--reply span.ProfileTweet-actionCount").attr(
                    "data-tweet-stat-count").replace(",", ""))
                tweet.id = tweetPQ.attr("data-tweet-id")
                tweet.permalink = 'https://twitter.com' + tweetPQ.attr("data-permalink-path")
                tweet.author_id = int(tweetPQ("a.js-user-profile-link").attr("data-user-id"))
                dateSec = int(tweetPQ("small.time span.js-short-timestamp").attr("data-time"))
                # UTC-aware timestamp plus a pre-formatted string variant.
                tweet.date = datetime.datetime.fromtimestamp(dateSec, tz=datetime.timezone.utc)
                tweet.formatted_date = datetime.datetime.fromtimestamp(dateSec, tz=datetime.timezone.utc) \
                    .strftime("%a %b %d %X +0000 %Y")
                tweet.mentions = " ".join(re.compile('(@\\w*)').findall(tweet.text))
                tweet.hashtags = " ".join(re.compile('(#\\w*)').findall(tweet.text))
                geoSpan = tweetPQ('span.Tweet-geo')
                if len(geoSpan) > 0:
                    tweet.geo = geoSpan.attr('title')
                else:
                    tweet.geo = ''
                urls = []
                for link in tweetPQ("a"):
                    try:
                        urls.append((link.attrib["data-expanded-url"]))
                    except KeyError:
                        pass
                tweet.urls = ",".join(urls)
                results.append(tweet)
                resultsAux.append(tweet)
                if receiveBuffer and len(resultsAux) >= bufferLength:
                    receiveBuffer(resultsAux)
                    resultsAux = []
                batch_cnt_results += 1
                if tweetCriteria.maxTweets > 0 and batch_cnt_results >= tweetCriteria.maxTweets:
                    active = False
                    break
    # Flush any remaining buffered tweets.
    if receiveBuffer and len(resultsAux) > 0:
        receiveBuffer(resultsAux)
        resultsAux = []
    return results
def update_tweet_info(session, tw):
    """Upsert a tweepy status `tw` (and its user, quoted/retweeted statuses,
    and entities) into the database via `session`.

    Recursively processes quoted and retweeted statuses first, then writes
    the Tweet row field by field, then the entity tables (media, hashtags,
    urls, symbols, user mentions). Commits several times along the way.
    """
    # Merge extended_entities over entities so extended media wins.
    entities = tw.entities.copy()
    if hasattr(tw, 'extended_entities'):
        for (k, v) in tw.extended_entities.items():
            entities[k] = v
    update_user_info(session, tw.user)
    if hasattr(tw, 'quoted_status'):
        quoted_status = tw.quoted_status
        # Streaming payloads deliver quoted_status as a raw dict; parse it.
        if type(quoted_status) == dict:
            quoted_status = tweepy.Status.parse(api, quoted_status)
        update_tweet_info(session, quoted_status)
    if hasattr(tw, 'retweeted_status'):
        update_tweet_info(session, tw.retweeted_status)
    # Fetch-or-create the Tweet row keyed by the numeric status id.
    tw_db = session.query(models.Tweet)\
        .options(load_only("id"))\
        .filter_by(id=int(tw.id_str))\
        .one_or_none()
    if tw_db is None:
        tw_db = models.Tweet(id=int(tw.id_str))
        session.add(tw_db)
    if tw.coordinates is not None:
        # GeoJSON order: [longitude, latitude].
        tw_db.coordinates_longitude = tw.coordinates['coordinates'][0]
        tw_db.coordinates_latitude = tw.coordinates['coordinates'][1]
    else:
        tw_db.coordinates_longitude = None
        tw_db.coordinates_latitude = None
    tw_db.created_at = tw.created_at
    if hasattr(tw, 'current_user_retweet'):
        tw_db.current_user_retweet = \
            int_or_None(tw.current_user_retweet['id_str'])
    else:
        tw_db.current_user_retweet = None
    tw_db.favorite_count = tw.favorite_count
    tw_db.favorited = tw.favorited
    tw_db.filter_level = getattr(tw, 'filter_level', None)
    tw_db.in_reply_to_screen_name = tw.in_reply_to_screen_name
    tw_db.in_reply_to_status_id = int_or_None(tw.in_reply_to_status_id_str)
    tw_db.in_reply_to_user_id = int_or_None(tw.in_reply_to_user_id_str)
    tw_db.lang = tw.lang
    if hasattr(tw, 'place') and tw.place is not None:
        # Serialize the place object (plus bounding box) to a JSON string.
        place = {}
        for k in [
                'attributes', 'country', 'code', 'country_code', 'full_name',
                'id', 'name', 'place_type', 'url'
        ]:
            if hasattr(tw.place, k):
                place[k] = getattr(tw.place, k)
        place['bounding_box'] = {}
        place['bounding_box']['coordinates'] = \
            tw.place.bounding_box.coordinates
        place['bounding_box']['type'] = \
            tw.place.bounding_box.type
        tw_db.place = json.dumps(place)
    else:
        tw_db.place = None
    tw_db.possibly_sensitive = getattr(tw, 'possibly_sensitive', None)
    tw_db.quoted_status_id = \
        int_or_None(getattr(tw, 'quoted_status_id_str', None))
    if hasattr(tw, 'scopes') and tw.scopes is not None:
        tw_db.scopes = json.dumps(tw.scopes)
    else:
        tw_db.scopes = None
    tw_db.retweet_count = tw.retweet_count
    tw_db.retweeted = tw.retweeted
    if hasattr(tw, 'retweeted_status'):
        tw_db.retweeted_status_id = int_or_None(tw.retweeted_status.id_str)
    else:
        tw_db.retweeted_status_id = None
    tw_db.source = tw.source
    tw_db.source_url = tw.source_url
    tw_db.text = tw.text
    tw_db.truncated = tw.truncated
    tw_db.user_id = int_or_None(tw.user.id_str)
    if hasattr(tw, 'withheld_copyright'):
        tw_db.withheld_copyright = tw.withheld_copyright
    else:
        tw_db.withheld_copyright = None
    if hasattr(tw, 'withheld_in_countries'):
        tw_db.withheld_in_countries = tw.withheld_in_countries
    else:
        tw_db.withheld_in_countries = None
    if hasattr(tw, 'withheld_scope'):
        tw_db.withheld_scope = tw.withheld_scope
    else:
        tw_db.withheld_scope = None
    session.commit()
    # Entities of a retweet duplicate the original tweet's; skip them.
    if not hasattr(tw, 'retweeted_status'):
        for m in entities.get('media', []):
            update_media_info(session, tw, m)
    # Upsert hashtags keyed by (tweet_id, indices span).
    for ht in entities.get('hashtags', []):
        tweet_id = int(tw.id_str)
        indices_begin = ht['indices'][0]
        indices_end = ht['indices'][1]
        ht_db = session.query(models.TweetHashtag)\
            .options(load_only("tweet_id", "indices_begin", "indices_end"))\
            .filter_by(tweet_id=tweet_id,
                       indices_begin=indices_begin,
                       indices_end=indices_end)\
            .one_or_none()
        if ht_db is None:
            ht_db = models.TweetHashtag(tweet_id=int(tw.id_str),
                                        indices_begin=indices_begin,
                                        indices_end=indices_end)
            session.add(ht_db)
        ht_db.text = ht['text']
        session.commit()
    # Upsert urls keyed by (tweet_id, indices span).
    for url in entities.get('urls', []):
        tweet_id = int(tw.id_str)
        indices_begin = url['indices'][0]
        indices_end = url['indices'][1]
        url_db = session.query(models.TweetUrl)\
            .options(load_only("tweet_id", "indices_begin", "indices_end"))\
            .filter_by(tweet_id=tweet_id,
                       indices_begin=indices_begin,
                       indices_end=indices_end)\
            .one_or_none()
        if url_db is None:
            url_db = models.TweetUrl(tweet_id=int(tw.id_str),
                                     indices_begin=indices_begin,
                                     indices_end=indices_end)
            session.add(url_db)
        url_db.url = url['url']
        url_db.display_url = url['display_url']
        url_db.expanded_url = url['expanded_url']
        session.commit()
    # Upsert cashtag symbols keyed by (tweet_id, indices span).
    for sym in entities.get('symbols', []):
        tweet_id = int(tw.id_str)
        indices_begin = sym['indices'][0]
        indices_end = sym['indices'][1]
        sym_db = session.query(models.TweetSymbol)\
            .options(load_only("tweet_id", "indices_begin", "indices_end"))\
            .filter_by(tweet_id=tweet_id,
                       indices_begin=indices_begin,
                       indices_end=indices_end)\
            .one_or_none()
        if sym_db is None:
            sym_db = models.TweetSymbol(tweet_id=int(tw.id_str),
                                        indices_begin=indices_begin,
                                        indices_end=indices_end)
            session.add(sym_db)
        sym_db.text = sym['text']
        session.commit()
    # Upsert user mentions keyed by (tweet_id, indices span).
    for um in entities.get('user_mentions', []):
        tweet_id = int(tw.id_str)
        indices_begin = um['indices'][0]
        indices_end = um['indices'][1]
        um_db = session.query(models.TweetUserMention)\
            .options(load_only("tweet_id", "indices_begin", "indices_end"))\
            .filter_by(tweet_id=tweet_id,
                       indices_begin=indices_begin,
                       indices_end=indices_end)\
            .one_or_none()
        if um_db is None:
            um_db = models.TweetUserMention(tweet_id=int(tw.id_str),
                                            indices_begin=indices_begin,
                                            indices_end=indices_end)
            session.add(um_db)
        um_db.user_id = int(um['id_str'])
        um_db.screen_name = um['screen_name']
        um_db.name = um['name']
        session.commit()