def get_user_tokens(user):
    # Dump the user's tweets to a CSV file if we don't already have one on disk.
    tweets_csv = CSV.format(user)
    if not os.path.isfile(tweets_csv):
        get_all_tweets(user)

    # Collect every lower-cased word from the 'text' column of the CSV.
    words = []
    with open(tweets_csv) as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            for w in row['text'].lower().split():
                words.append(w)
    return tokenize_text(words)
def downloader_function(thing, thing2):
    # Rotate through the accounts, one per day: days since the Unix epoch,
    # modulo the list length, picks today's account.
    date = (datetime.datetime.utcnow() - datetime.datetime(1970, 1, 1)).days
    people = [
        "bijanmustard", "7e5h", "dadurath", "sagebeans",
        "spinebashed", "theonion", "clickhole",
    ]
    who = people[date % len(people)]

    # Dump the account's tweets, then replace the cleaned file in the bucket.
    tweet_dumper.get_all_tweets(who)
    delete_blob("twitter_bot_bucket", f"{who}-clean.txt")
    upload_blob("twitter_bot_bucket", f"/tmp/{who}-clean.txt", f"{who}-clean.txt")
def download_tweets():
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_key, access_secret)

    right_wing = [
        "naftalibennett", "ronisassover", "bezalelsm", "gWBnMgXPN6kCe6a",
        "Likud_Party", "erane67", "Ayelet__Shaked", "netanyahu", "oren_haz",
        "bezalelsm", "YehudahGlick", "dudiamsalem", "giladerdan1", "gidonsaar",
        "KahlonMoshe", "YinonMagal", "ErelSegal", "davidbitan", "moshefeiglin",
        "NirBarkat",
    ]
    left_wing = [
        "zehavagalon", "tamarzandberg", "MeravMichaeli", "MeravMichaeli",
        "Syechimovich", "mkgilon", "GabbayAvi", "ishmuli", "cabel_eitan",
        "EldadYaniv", "yarivop", "amirperetz", "StavShaffir", "dovhanin",
        "Isaac_Herzog", "NitzanHorowitz", "machanetzioni",
    ]
    print(len(right_wing))
    print(len(left_wing))

    # nafatali_tweets = get_all_tweets2("naftalibennett", auth, 1)
    # Label convention: 0 - left wing, 1 - right wing
    for screen_name in left_wing:
        get_all_tweets(screen_name, auth, 0)
    for screen_name in right_wing:
        get_all_tweets(screen_name, auth, 1)
        # Fragment from inside the per-URL loop of download_images(): each image
        # URL has been opened as `response` and is saved to `fileName`. The
        # truncated `try:`/`with open(` header is restored here.
        try:
            with open(fileName, 'wb') as out_file:
                shutil.copyfileobj(response, out_file)
        except urllib.error.HTTPError:
            sys.stdout.flush()
            print(
                "\r %s is not a downloadable image. Skipping to next url..."
                % url)
        fileNumber += 1

    sys.stdout.write("\r\nDone!")
    sys.stdout.flush()
    sys.stdout.write("\r\n")


if __name__ == '__main__':
    # Enter your username in place of mine!
    userName = "******"
    tweet_dumper.get_all_tweets(userName)

    listOfPictureUrls = []
    get_urls(listOfPictureUrls, userName)
    create_directory()
    os.chdir("./imagesFromTweets")
    print("downloading!")
    download_images(listOfPictureUrls)
from tweet_dumper import get_all_tweets
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Let's create some variables that we can easily change to affect the project.
account1 = "realDonaldTrump"
account2 = "HillaryClinton"

# Let's begin by fetching our data. This is fairly easy: just pass in the two
# account names and we get back lists of the form [tweet1, tweet2, tweet3],
# where the tweets come from that account. We want all the data in one list, so
# we extend the first list with the second one (we actually do this in the next
# section).
data = get_all_tweets(account1)
data2 = get_all_tweets(account2)

# Let's also make a list of labels for the tweets. Our labels will be the
# account names. Though the labels live in a separate list, we want data[i] to
# be a tweet written by labels[i], so keep the labels in the same order as the
# tweets.
labels = [account1] * len(data)
labels2 = [account2] * len(data2)
labels.extend(labels2)
data.extend(data2)

# Now we have our ordered data with labels. We need to do two things: first,
# shuffle the data so it is uniformly distributed; second, split it into a
# training set to build our classifier and a test set to measure its accuracy.
# For simplicity's sake, we split the data and label lists into four new lists:
# trainingData, trainingLabels, testData, testLabels. In this example we use
# 75% of the data for training and 25% for testing. Thankfully a function
# exists for exactly this purpose!
trainingData, testData, trainingLabels, testLabels = train_test_split(
    data, labels, test_size=0.25)
print(trainingData)

# Now we have our data, but we should clean it up -- that is, prepare it so we
# can extract features from it. Since we're dealing with words, we want to
# tokenize: take the string that is a tweet and turn it into a list of the
# words in the tweet, so "This is a test" becomes ["this", "is", "a", "test"].
# This step also strips some punctuation and is smart enough to keep a place
# like New York as one token instead of two. There would typically be
# preprocessing code here, but the function in the next step handles it for us.

# Finally we can start the training process. The first thing we do is extract
# features from our data. There are many possible features, but one of the
# simplest is bag of words: the frequency of every word in every tweet. If the
# dataset were ["the quick fox", "the red apple"], each datapoint could be
# represented by a feature vector -- "the quick fox" becomes [1 1 1 0 0] and
# "the red apple" becomes [1 0 0 1 1]. Put another way: first we extract all
# the words from the training data and build a vocabulary, here
# [the, quick, fox, red, apple]. These become the column titles of our feature
# matrix (shown below). Then we fill in the matrix by counting, for each
# datapoint, how many times each word appears:
#             the  quick  fox  red  apple
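The bag-of-words walkthrough above stops at the column headers of the toy feature matrix. As an illustrative sketch (not part of the original script, which goes on to use TfidfVectorizer), here is the same two-document example run through scikit-learn's CountVectorizer; note that the vectorizer orders its vocabulary alphabetically rather than by first appearance, and that get_feature_names_out assumes scikit-learn 1.0+ (older releases expose get_feature_names instead).

from sklearn.feature_extraction.text import CountVectorizer

# The two toy "tweets" from the explanation above.
toy_docs = ["the quick fox", "the red apple"]

# CountVectorizer builds the vocabulary and the bag-of-words count matrix.
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(toy_docs)

# Vocabulary, sorted alphabetically: ['apple' 'fox' 'quick' 'red' 'the']
print(vectorizer.get_feature_names_out())

# One row per document, one column per vocabulary word:
#   "the quick fox" -> [0 1 1 0 1]
#   "the red apple" -> [1 0 0 1 1]
print(counts.toarray())

TfidfVectorizer, which the script imports, plays the same role but rescales these counts by inverse document frequency, downweighting words like "the" that appear in almost every tweet.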
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 29 00:12:29 2017

@author: scram
"""
import tweet_dumper

tweet_dumper.get_all_tweets("@realdonaldtrump")
def getTweetHistory():
    createTweetTable()
    cur = getTwitter()
    # If getTwitter() returns a standard DB-API cursor, fetchall() yields one
    # tuple per row, so the screen name would be name[0] rather than name.
    for name in cur.fetchall():
        get_all_tweets(name)