Example #1
import tweepy
import os
import networkx as nx
import matplotlib.pyplot as plt

# auth to twitter
auth = tweepy.AppAuthHandler('xxxxxxxxxxxxx', 'xxxxxxxxxxx')
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)


def getIndex(item, alist):
    # Return the index of item in alist, or -99 if it is not present.
    try:
        return alist.index(item)
    except ValueError:
        return -99


fromuser = raw_input('Enter the username to start from: ')
target = raw_input('Enter the target username: ')
cursor = fromuser
found = False
G = nx.Graph()
nameslist = [fromuser]

while not found:
    try:
        print cursor
        for follower in tweepy.Cursor(api.followers,
                                      screen_name=cursor).items():
            if follower.screen_name not in nameslist:
                G.add_edge(cursor, follower.screen_name)
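
The excerpt above is cut off mid-loop. A minimal sketch of how such a breadth-first follower search could finish (a hypothetical continuation, not the original author's code):

import tweepy
import networkx as nx

def find_path(api, fromuser, target):
    # Walk followers breadth-first, adding edges until the target appears.
    G = nx.Graph()
    queue = [fromuser]
    seen = {fromuser}
    while queue:
        cursor = queue.pop(0)
        for follower in tweepy.Cursor(api.followers, screen_name=cursor).items():
            name = follower.screen_name
            if name not in seen:
                seen.add(name)
                queue.append(name)
                G.add_edge(cursor, name)
            if name == target:
                return nx.shortest_path(G, fromuser, target)
    return None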
Example #2
import codecs
import tweepy
import time
import sys
import csv
import json
from tweepy import OAuthHandler
import nltk
import re

#required tokens for working with data from twitter using Tweepy
access_token = "103589925-PYiNRi6sAoSAFCau7Q5zDAqF7Kt8WwsK5EunWL3I"
access_token_secret = "u8N1nS93eN5npmtBOAxCwJgZE0W4wPCNe1CEuCB9lEIys"
consumer_key = "IIFBxSZv8YnhRJuvvDJVkR4ht"
consumer_secret = "zou3XOVMDXp9esingDNEowUeEPmTKkY4daZGYdmalovyd9JCxr"
auth = tweepy.AppAuthHandler(consumer_key, consumer_secret)
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
#Authenticating with given tokens
if (not api):
    print("Can't Authenticate")
    sys.exit(-1)


def twitter_miner(query):
    tweetCount = 0
    maxTweets = 200
    neg = 0
    pos = 0
    cnt = 0
    file1 = open('conf.txt', 'r', encoding="utf-8")
    file2 = open('ou.txt', 'w', encoding="utf-8")
Example #3
import tweepy
import sys
# Define ckey and csecret in keys.py with your application's key and secret.
from keys import *
import numpy as np
import pandas as pd

auth = tweepy.AppAuthHandler(ckey, csecret)
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

if (not api):
    print("Can't Authenticate")
    sys.exit(-1)

import os

searchQuery = '#impostersyndrome OR #impostorsyndrome OR imposter syndrome OR impostor syndrome'  # this is what we're searching for
maxTweets = 10000000  # Some arbitrary large number
tweetsPerQry = 100  # this is the max the API permits
fName = 'tweets.csv'  # We'll store the tweets in a text file.

# If results from a specific ID onwards are required, set since_id to that ID.
# else default to no lower limit, go as far back as API allows
# sinceId = None  ## commented out by abc

## abc: 10/16/18: new code for incremental search of tweets to be added
## to a possibly existing file tweets.csv
import csv

exists = os.path.isfile(fName)
if exists:
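
The excerpt stops at the exists check. A hedged sketch of the incremental logic the comments describe, assuming the first column of tweets.csv holds the tweet id (that layout is an assumption, not shown in the excerpt):

sinceId = None
if exists:
    # Resume from the newest tweet id already on disk.
    with open(fName, 'r', newline='') as f:
        ids = [int(row[0]) for row in csv.reader(f) if row and row[0].isdigit()]
    if ids:
        sinceId = max(ids)

# tweepy drops keyword arguments that are None, so this call works for both cases.
new_tweets = api.search(q=searchQuery, count=tweetsPerQry, since_id=sinceId)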
Example #4
from config import tw_key, tw_secret, aws_key, aws_secret
import tweepy
import numpy as np
import boto3
from random import randint
import time
import json

RUNTIME = 1.95 * 60 * 60 * 24  #1.95 days in seconds

auth = tweepy.AppAuthHandler(tw_key, tw_secret)
api = tweepy.API(auth)
has_data_file = "data/has_data"
save_file = "data/data_store.json"
twitterID = np.dtype([("id", np.uint32), ("followers", np.uint32),
                      ("friends", np.uint32)])

filename = "data/users_data"
file = open(filename, "r")
users_data = np.fromfile(file, twitterID, count=-1)
file.close()
numUsers = len(users_data)

try:
    file = open(has_data_file, "r")
    has_data = list(np.fromfile(file, np.uint32, count=-1))
    file.close()
except OSError:  # no has_data file yet; start with an empty list
    has_data = []

#client = boto3.client('dynamodb', region_name='us-west-2')
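
For reference, a small sketch of how a users_data file in that structured layout could be produced in the first place (illustrative only; not part of the original script):

# Write a few records in the twitterID layout, then read them back.
sample = np.array([(1, 100, 50), (2, 2000, 10)], dtype=twitterID)
with open("data/users_data", "wb") as f:
    sample.tofile(f)
with open("data/users_data", "rb") as f:
    print(np.fromfile(f, twitterID, count=-1))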
Example #5

import sys
import tweepy
from pymongo import MongoClient

client = MongoClient('localhost', 27017)
fake = 'nameDB'
real = 'nameDB'
mailto = "emailAddress"
db = client[fake]

RTthreshold = 500
mongoThresholdPerDoc = 300000  # actual cap is 600000, but we stop earlier in case of crashes

ACCESS_TOKEN = sys.argv[3]
ACCESS_SECRET = sys.argv[4]
CONSUMER_KEY = sys.argv[1]
CONSUMER_SECRET = sys.argv[2]
auth = tweepy.AppAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)

api = tweepy.API(auth, retry_count=10, retry_delay=5, retry_errors=set([103]), wait_on_rate_limit=True,
                 wait_on_rate_limit_notify=True)


def get_followers(user_id, tweetnumber, retweeter_num, total_retweeters):
    users = []
    page_count = 0
    try:
        for i, user in enumerate(tweepy.Cursor(api.followers_ids, id=user_id, count=5000).pages()):#gets 5000 users per page
            sys.stdout.write('\r')
            sys.stdout.write(
                'Getting page %i for followers, for user: %i, for tweet number: %i, number of retweeters %i / %i' % (
                    i, user_id, tweetnumber, retweeter_num, total_retweeters))
            sys.stdout.flush()
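
The function is cut off after the progress output. A hypothetical continuation that collects the ids from each page and finishes the function:

            users.extend(user)  # each page is a list of up to 5000 follower ids
            page_count += 1
    except tweepy.TweepError as err:
        print('\nError while fetching followers:', err)
    return users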
Example #6
import os
import ConfigParser  # Python 2 module; on Python 3 use configparser
import tweepy

# main config file
config = ConfigParser.ConfigParser()
config.read(os.path.dirname(os.path.abspath(
    __file__)) + '/bbs-tweet-wall.cfg')

# Twitter auth key (kept in a separate file)
auth_config = ConfigParser.ConfigParser()
auth_config.read(os.path.dirname(os.path.abspath(
    __file__)) + '/auth.cfg')

# get Twitter auth values from the auth config
auth_key = auth_config.get('auth', 'auth_key')
auth_secret = auth_config.get('auth', 'auth_secret')

# set up Twitter OAuth 2 with those values
auth = tweepy.AppAuthHandler(auth_key, auth_secret)
api = tweepy.API(auth)

# Get Twitter list count from config file, make it an integer
list_count = config.get('config', 'list_count')
list_count_int = int(list_count)

# Get format parameters from config
tweet_max = int(config.get('config', 'tweet_max'))
tweet_width = int(config.get('config', 'tweet_width'))
tweet_lines = int(config.get('config', 'tweet_lines'))
file_ext = config.get('config', 'file_ext')
output_dir = config.get('config', 'output_dir')
script_dir = config.get('config', 'script_dir')
bgFileName = config.get('config', 'bgFileName')
output_file = output_dir + '/' + 'tweets-all.' + file_ext
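
Given the lookups above, the two config files would contain sections along these lines (all values here are placeholders, not the original configuration):

# auth.cfg
[auth]
auth_key = YOUR_API_KEY
auth_secret = YOUR_API_SECRET

# bbs-tweet-wall.cfg
[config]
list_count = 5
tweet_max = 20
tweet_width = 40
tweet_lines = 4
file_ext = png
output_dir = /tmp/tweets
script_dir = /opt/bbs-tweet-wall
bgFileName = background.png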
Example #7
import tweepy

# Number of tweets per csv file
NUM_TWEETS_PER_FILE = 250

# Number of data csv files needed (integer division)
MAX_NUM_DATA_FILES = 10000 // NUM_TWEETS_PER_FILE

# Tokens and secrets for authentication with the Twitter API
# These tokens were removed, as they are meant to be kept secret
consumer_token = ""
consumer_secret = ""

access_token = ""
access_token_secret = ""

# Authenticates with the Twitter API
auth = tweepy.AppAuthHandler(consumer_token, consumer_secret)
# auth.set_access_token(access_token, access_token_secret)

# Gets tweepy api wrapper
# Sleep for 15 minutes because of the Twitter API rate limit
# setting wait_on_rate_limit to True will make the program sleep automatically to avoid exceeding rate limits
api = tweepy.API(auth, wait_on_rate_limit_notify=True, wait_on_rate_limit=True)

# Stores hashes of all received tweets in a set to ensure there are no duplicates
tweet_hash_set = set()

# Stores the file number we are currently working with
file_num = 0

# Stores number of tweets collected
tweet_count = 0
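
A small sketch of how the dedup set and counters above might be used as tweets arrive (hypothetical; this logic is not part of the excerpt):

def record_tweet(tweet):
    # Skip tweets whose text has already been seen.
    global file_num, tweet_count
    h = hash(tweet.text)
    if h in tweet_hash_set:
        return False
    tweet_hash_set.add(h)
    tweet_count += 1
    if tweet_count % NUM_TWEETS_PER_FILE == 0:
        file_num += 1  # roll over to the next csv file
    return True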
Example #8
import os
import nltk
import tweepy
import pandas as pd
from datetime import datetime
from nltk.tokenize import word_tokenize
import datetime as dt
from shapely.geometry import Point, Polygon
import matplotlib.pyplot as plt

get_ipython().run_line_magic('matplotlib', 'inline')
nltk.download('punkt')
nltk.download('stopwords')

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', -1)

app_key = ''
app_key_secret = ''
auth = tweepy.AppAuthHandler(app_key, app_key_secret)
api = tweepy.API(auth)

# Read the data files.
df = pd.DataFrame()
path = 'data'
filenames = os.listdir('data')

print(f'Reading {len(filenames)} files ...')
for i, filename in enumerate(filenames):
    df = df.append(pd.read_csv(path + '/' + filename),
                   ignore_index=True,
                   sort=True)
Example #9
 def connect_api(self):
     """this function connects the api"""
     self.auth = tweepy.AppAuthHandler(self.consumer_key, self.consumer_secret)
     self.api = tweepy.API(self.auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
     if not self.api:
         print("Problem Connecting to API")
Example #10
# Extracting Data from Twitter
#                       Pavan Narayanan, 2016

import sys
import jsonpickle
import tweepy

auth = tweepy.AppAuthHandler("GET YOUR OWN AUTHETICATION", "SAME GOES HERE")

api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

searchQuery = '"verizon"'
maxTweets = 10000000
tweetsPerQry = 200
fName = 'tweets_' + searchQuery + '.txt'
sinceId = None
max_id = -1
tweetCount = 0

with open(fName, 'w') as f:
    while tweetCount < maxTweets:
        try:
            if max_id <= 0:
                if not sinceId:
                    new_tweets = api.search(q=searchQuery, count=tweetsPerQry)
                else:
                    new_tweets = api.search(q=searchQuery,
                                            count=tweetsPerQry,
                                            since_id=sinceId)
            else:
                if not sinceId:
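
The else-branch is cut off here; the full four-way since_id/max_id ladder appears in Example #15 below. A condensed equivalent that builds the keyword arguments once (a sketch, not the original code):

kwargs = {'q': searchQuery, 'count': tweetsPerQry}
if sinceId:
    kwargs['since_id'] = sinceId
if max_id > 0:
    kwargs['max_id'] = str(max_id - 1)
new_tweets = api.search(**kwargs)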
Example #11
def get_all_tweets(query1, query2):
    # consumer_key, consumer_secret and maxTweets are module-level names in the original script.

    # authorize twitter, initialize tweepy
    auth = tweepy.AppAuthHandler(consumer_key, consumer_secret)
    api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

    # initialize a list to hold all the tweepy Tweets
    alltweets = []

    for query in (query1, query2):
        # make initial request for the most recent tweets
        new_tweets = api.search(q=query, count=100)

        # save most recent tweets
        alltweets.extend(new_tweets)

        # save the id of the oldest tweet less one
        oldest = alltweets[-1].id - 1

        tweetCount = 100
        # keep grabbing tweets until there are no tweets left to grab
        while len(new_tweets) > 0 and tweetCount < maxTweets:

            # all subsequent requests use the max_id param to prevent duplicates
            new_tweets = api.search(q=query, count=50, max_id=oldest)

            # save most recent tweets
            alltweets.extend(new_tweets)

            # update the id of the oldest tweet less one
            oldest = alltweets[-1].id - 1

            print("...%s tweets downloaded so far" % (len(alltweets)))

            tweetCount += len(new_tweets)

    # transform the tweepy tweets into a 2D array that will populate the csv
    outtweets = [[tweet.text] for tweet in alltweets]

    # write the csv
    with open('../data/lalinbdg_tweets.csv', 'w') as f:
        writer = csv.writer(f)
        writer.writerows(outtweets)
Example #12
def main():
    '''
    :param argv: argv[1]: which api will be chosen: streaming, search, or username.
                 argv[2]: which city for this task, Melbourne or Sydney
    :return:
    '''
    # Initialization...
    api_type = sys.argv[1]
    city = sys.argv[2]

    if city == 'Melbourne':
        geo_circle_info = GEO_CIRCLE_INFO_MELB
        which_file_path = './tweet_id_melbourne.txt'
        db_collection = 'tweets_melbourne'
    elif city == 'Sydney':
        geo_circle_info = GEO_CIRCLE_INFO_SYDNEY
        which_file_path = './tweet_id_sydney.txt'
        db_collection = 'tweets_sydney'
    else:
        print(
            'You should choose either \'Melbourne\' or \'Sydney\' as the second parameter'
        )
        return 0

    # This handles Twitter authentication and the connection to the Twitter Streaming API
    l = StdOutListener()

    auth1 = OAuthHandler(consumer_key, consumer_secret)
    auth1.set_access_token(access_token, access_token_secret)

    auth = tweepy.AppAuthHandler(API_KEY, API_SECRET)
    api = tweepy.API(auth,
                     wait_on_rate_limit=True,
                     wait_on_rate_limit_notify=True)

    tweet_count = 0

    screen_names = set()

    if api_type == 'search':
        coll = get_db(db_collection)
        max_id = get_max_id()
        if not max_id:
            print "This is your first time to invoke this script ^_^"
        else:
            print "The task will start from tweet NO." + str(max_id)

        with open(which_file_path, 'a') as f:
            time_stamp = time.strftime('%Y-%m-%d-%H-%M',
                                       time.localtime(time.time()))
            f.write(str(time_stamp) + '\n')

            while tweet_count < MAX_TWEETS:
                try:
                    if (max_id <= 0):
                        new_tweets = api.search(q="*",
                                                geocode=geo_circle_info,
                                                count=TWEETS_PER_QRY)
                    else:
                        new_tweets = api.search(q="*",
                                                geocode=geo_circle_info,
                                                count=TWEETS_PER_QRY,
                                                max_id=str(max_id - 1))
                    if not new_tweets:
                        print("No more tweets found")
                        break
                    oldest_id = new_tweets[-1].id
                    f.write(str(oldest_id) + '\n')
                    global count
                    for tweet in new_tweets:
                        json_str = json.dumps(tweet._json)
                        json_o = json.loads(json_str)
                        coll.insert(json_o)

                        new_screen_name = json_o['user']['screen_name']
                        if new_screen_name not in screen_names:
                            screen_names.add(new_screen_name)
                            get_all_tweets_by_screen_name(
                                new_screen_name, api, db_collection)

                        count += 1

                    tweet_count += len(new_tweets)
                    print("Download {0} tweets".format(tweet_count))

                except tweepy.TweepError as e:
                    print("ERROR FOUND: " + str(e))
    elif api_type == 'streaming':
        while True:
            try:
                stream = Stream(auth1, l)
                stream.filter(locations=GEOBOX_MELBOURNE)
            except Exception as e:
                print e
                continue
    elif api_type == 'username':
        last_name = get_last_name()
        if last_name == -1:
            print "No name history has been found, start from the first name ^_^"
        else:
            print "This task will start from name: [%s]" % last_name
        f_n_h = open(file_path_names_history, 'a')
        flag = False
        with open(file_path_names, 'r') as f:
            for screen_name in f.readlines():
                try:
                    if last_name == -1:
                        f_n_h.write(screen_name.strip() + '\n')
                        get_all_tweets_by_screen_name(screen_name.strip(), api)
                    elif not flag and last_name != -1:
                        if screen_name.strip() != last_name.strip():
                            pass
                        else:
                            flag = True
                            # f_n_h.write(screen_name.strip() + '\n')
                            # get_all_tweets_by_screen_name(screen_name.strip(), api)
                    else:
                        f_n_h.write(screen_name.strip() + '\n')
                        get_all_tweets_by_screen_name(screen_name.strip(), api)
                except Exception as e:
                    print e
                    continue
    else:
        print 'For the 1st parameter, none of "streaming", "search" or "username" was correctly input'
Example #13
import tweepy
import sys
import jsonpickle
import os

auth = tweepy.AppAuthHandler(
    "6cxqy2zGDG79IT0kzwl1VTZRt",
    "B11tACBDKtcpQ4jh9gRqmueIMxVqmkpwCYnYMEVJerqSR6eAyH")

api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

if (not api):
    print("Can't Authenticate")
    sys.exit(-1)


def get_tweet(searchQuery, maxTweets=200):

    tweetsPerQry = 100
    # Some arbitrary large number
    # If results from a specific ID onwards are required, set since_id to that ID.
    # else default to no lower limit, go as far back as API allows
    sinceId = None

    # If only results below a specific ID are required, set max_id to that ID.
    # else default to no upper limit, start from the most recent tweet matching the search query.
    max_id = -1
    c = 0
    tweetCount = 0
    tweets = []
    tts = []
Example #14
import os
import re
import pandas
import tweepy
from datetime import date
from textblob import TextBlob


def calculatetweetsdemo(input_obtained, input_month):
    tweet_dict = {}
    name_not_entered = False
    print("called")
    if input_obtained == '':
        name_not_entered = True
    positive_count = 0
    negative_count = 0
    neutral_count = 0
    consumer_key = 'pBQ6uagJoN3eksDl55bzaSepf'
    consumer_secret = 'NEA8UFjkf7325FhKWba02kgQJWSKmQLhCrXzWYyyyaQEXICNic'
    auth = tweepy.AppAuthHandler(consumer_key, consumer_secret)
    api = tweepy.API(auth, wait_on_rate_limit=True)
    tweet_list = []
    final_month = 0
    final_year = 2019

    def calculate_sentiment(tweet):
        test_tweet = TextBlob(cleantweet(tweet))
        if test_tweet.sentiment.polarity > 0:
            return 'positive'
        elif test_tweet.sentiment.polarity == 0:
            return 'neutral'
        else:
            return 'negative'

    def cleantweet(tweet):
        return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet).split())

    def gettweets(final_month, final_year):
        nonlocal months_not_entered
        nonlocal error
        x = re.findall("\D", str(input_month))
        if x:
            error = True
            return
        if input_month == "":
            months_not_entered = True
            return

        entered_month = input_month
        nonlocal notweets
        print(entered_month)
        if entered_month == 0:
            notweets = True
        if entered_month > 24:
            error = True
            print(error)
            return
        todays_date = date.today()
        print(todays_date)
        todays_date = str(todays_date).split("-")
        current_month = int(todays_date[1])

        if entered_month < current_month:
            final_month = current_month - entered_month
        elif 4 - entered_month <= 0:
            quotient = int(entered_month / current_month)
            final_year = final_year - quotient
            entered_month = entered_month % current_month
            final_month = 12 - entered_month
        print(str(final_year) + " " + str(final_month))
        todays_day = todays_date[2]
        print(todays_day)
        nonlocal positive_count, neutral_count, negative_count
        repeated_count = 0
        tweet_counter = 0
        rootDir = os.path.dirname(os.path.abspath(__file__))
        fh = open(rootDir + "/" + input_obtained + str(input_month) + "tweets" + ".txt", "w", encoding="utf-8")
        for tweet in tweepy.Cursor(api.search, q="#" + input_obtained, lang="en", count = 200,
                                   since=str(final_year) + "-" + str(final_month) + "-"+todays_day).items():
            print("tweet " + str(tweet_counter) + " processed")
            sentiment_type = calculate_sentiment(tweet.text)
            tweets_no_repeat = {'text': tweet.text, 'sentiment': sentiment_type}
            if tweet.retweet_count > 0:
                if tweets_no_repeat not in tweet_list:
                    tweet_list.append(tweets_no_repeat)
                    fh.write(tweet.text.replace("\n","")+"--++=="+sentiment_type+"\n")
                    if sentiment_type == 'positive':
                        positive_count = positive_count + 1
                    elif sentiment_type == 'negative':
                        negative_count = negative_count + 1
                    else:
                        neutral_count = neutral_count + 1
                    print(tweet.text)
                else:
                    if sentiment_type == 'positive':
                        positive_count = positive_count + 1
                    elif sentiment_type == 'negative':
                        negative_count = negative_count + 1
                    else:
                        neutral_count = neutral_count + 1
                    repeated_count += 1

            elif tweet.retweet_count == 0:
                tweet_list.append(tweets_no_repeat)
                fh.write(tweet.text.replace("\n", "") + "--++==" + sentiment_type+"\n")
                if sentiment_type == 'positive':
                    positive_count = positive_count + 1
                elif sentiment_type == 'negative':
                    negative_count = negative_count + 1
                else:
                    neutral_count = neutral_count + 1
                print(tweet.text)
            tweet_counter += 1
        fh.close()
        total_tweets = positive_count + neutral_count + negative_count
        if total_tweets == 0:
            notweets = True
        print(str(positive_count) + " " + str(negative_count) + " " + str(neutral_count))
        print(" " + str(total_tweets))
        print("repeated ", repeated_count)
        print(len(tweet_list))
        nonlocal tweet_dict
        tweet_dict = {'Tweets': tweet_list}
        df = pandas.DataFrame.from_dict(tweet_dict)
        rootDir = os.path.dirname(os.path.abspath(__file__))
        df.to_csv(rootDir + "/" + input_obtained + str(input_month) + "tweets" + ".csv", index=False)
    error = False
    notweets = False
    months_not_entered = False
    gettweets(final_month, final_year)
    print(error)
    if error:
        return "invalid input"
    if notweets:
        return "no tweets"
    if months_not_entered and name_not_entered:
        return "name and months empty"
    if months_not_entered:
        return "months empty"
    if name_not_entered:
        return "name empty"
    if not error and not notweets:
        total_count = positive_count + neutral_count + negative_count
        if total_count == 0:
            return "total count zero"

        return str(positive_count)+","+str(negative_count)+","+str(neutral_count)
Example #15
def get_tweets(request):
    if request.POST:

        import tweepy, sys, jsonpickle

        consumer_key = 'K6V0CCCWRTrmk582ETAepQ77q'
        consumer_secret = 'T9q1U8hO6AKG8znSbsvjR7gu6eCGvxR6d1S4KJjCrPI8vvzndz'

        qry = '@Telkomsel AND (pulsa OR sinyal OR harga OR kualitas OR kuota OR internet OR jaringan OR pelayanan)'
        maxTweets = 1000  # Fill in whatever value suits your needs
        tweetsPerQry = 100  # Do not set this above 100; Twitter does not allow it
        t = datetime.now()
        formatted_time = t.strftime('%d-%m-%y %H.%M')
        fname = 'Tweets_' + formatted_time

        auth = tweepy.AppAuthHandler(consumer_key, consumer_secret)
        api = tweepy.API(auth,
                         wait_on_rate_limit=True,
                         wait_on_rate_limit_notify=True)
        if (not api):
            sys.exit(
                'Authentication failed; please check your Twitter "Consumer Key" & "Consumer Secret"'
            )

        sinceId = None
        max_id = -1
        tweetCount = 0

        with open(fname + '.json', 'w') as f:
            while tweetCount < maxTweets:
                try:
                    if (max_id <= 0):
                        if (not sinceId):
                            new_tweets = api.search(q=qry,
                                                    count=tweetsPerQry,
                                                    tweet_mode='extended')
                        else:
                            new_tweets = api.search(q=qry,
                                                    count=tweetsPerQry,
                                                    since_id=sinceId,
                                                    tweet_mode='extended')
                    else:
                        if (not sinceId):
                            new_tweets = api.search(q=qry,
                                                    count=tweetsPerQry,
                                                    max_id=str(max_id - 1),
                                                    tweet_mode='extended')
                        else:
                            new_tweets = api.search(q=qry,
                                                    count=tweetsPerQry,
                                                    max_id=str(max_id - 1),
                                                    since_id=sinceId,
                                                    tweet_mode='extended')
                    if not new_tweets:
                        print(
                            'No more Tweets found with Query="{0}"'
                            .format(qry))
                        break
                    for tweet in new_tweets:
                        if (tweet._json['user']["name"] != "Telkomsel"
                                and "?" not in tweet._json["full_text"] and
                                tweet._json['metadata']["iso_language_code"]
                                == "in"):
                            f.write(
                                jsonpickle.encode(tweet._json,
                                                  unpicklable=False) + '\n')
                            # text = tweet._json["full_text"]
                            # text = re.sub(r"(?:\|https?| https?|https? \://)\S+", "", text)
                            # character = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p',
                            #              'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
                            # for i in range(len(character)):
                            #     charac_long = 5
                            #     while charac_long >= 2:
                            #         char = character[i] * charac_long
                            #         text = text.replace(char, character[i])
                            #         charac_long -= 1
                            # text = ' '.join(word.strip(string.punctuation) for word in text.split())

                    tweetCount += len(new_tweets)
                    max_id = new_tweets[-1].id
                except tweepy.TweepError as e:
                    print("some error : " + str(e))
                    break
        """messages.add_message(request, messages.INFO, 'Tweets telah tersimpan pada filename: {1}'.format(tweetCount, fname))
        messages.add_message(request, messages.INFO, 'Jumlah Tweets telah tersimpan: %.0f' % tweetCount)"""
        fo = open(fname + '.json', 'r')
        fw = open(fname + '.txt', 'w')

        for line in fo:
            try:
                tweet = json.loads(line)
                text = ' '.join(
                    word.strip(string.punctuation)
                    for word in tweet['full_text'].split())
                text = re.sub(r"(?:\@ | @|@|https?| https?|https? \://)\S+",
                              "", text)
                character = [
                    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
                    'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x',
                    'y', 'z', '.'
                ]
                for i in range(len(character)):
                    charac_long = 5
                    while charac_long >= 2:
                        char = character[i] * charac_long
                        text = text.replace(char, character[i])
                        charac_long -= 1
                #text = ' '.join(word.strip(string.punctuation) for word in text.split())
                text = re.sub(r"\n", "", text)
                fw.write(text + "\n")
            except:
                continue

        import nltk

        testing_data = '/Users/achmed/Documents/data/testing.txt'
        pos_words = '/Users/achmed/PycharmProjects/positive.txt'
        neg_words = '/Users/achmed/PycharmProjects/negative.txt'

        def read_opinion_lexicon(fileName):
            dataFile = open(fileName, "r")
            word_set = set()
            for line in dataFile:
                line = line.strip()
                if line not in word_set:
                    word_set.add(line)
            dataFile.close()
            return word_set

        positive_words = read_opinion_lexicon(pos_words)
        negative_words = read_opinion_lexicon(neg_words)

        fo = open(fname + '.txt', 'r')
        Result.objects.all().delete()
        global sentpos_count, sentneg_count, sentnet_count
        sentpos_count = 0
        sentneg_count = 0
        sentnet_count = 0
        negation_words = [
            'tidak', 'bukan', 'gak', 'enggak', 'belum', 'ga', 'tdk', 'tak',
            'malah'
        ]

        for line in fo:
            sentence = line
            kalimat = sentence.split()
            doc_words = [w.lower() for w in kalimat]
            pos_count = 0
            neg_count = 0

            for w in doc_words:
                if w in positive_words:
                    if (doc_words[doc_words.index(w) - 1] in negation_words):
                        neg_count -= 1
                    else:
                        pos_count += 1
                if w in negative_words:
                    if (doc_words[doc_words.index(w) - 1] in negation_words):
                        pos_count += 1
                    else:
                        neg_count -= 1
            score = pos_count + neg_count

            if (score == 0):
                classify_result = "netral"
                sentnet_count += 1
                pos_score = 0.5
                neg_score = 0.5
            else:
                pos_score = pos_count / score
                neg_score = neg_count / score

                if (pos_score > neg_score):
                    classify_result = "positif"
                    sentpos_count += 1
                else:
                    classify_result = "negatif"
                    sentneg_count += 1

            sentiment2 = Result(sentiment=sentence,
                                classification=classify_result)
            sentiment2.save()

        #classifier.show_most_informative_features()

        #test_corpus, _ = load_corpus(testing_data)
        #test_set_features = [(doc_features(d), c) for (d, c) in test_corpus]

        #print(nltk.classify.accuracy(classifier, test_set_features))

    return render(request, 'sentiment_analysis.html',
                  {'obj': Result.objects.all()})
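
As a worked micro-example of the negation handling above: for the line "sinyal tidak bagus", if "bagus" is in positive_words, the preceding token "tidak" is a negation word, so neg_count is decremented instead of pos_count being incremented; the score comes out negative and the line is classified as negatif.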
Example #16
 def __init__(self) -> None:
     self.api_key = os.environ.get('TWITTER_API_KEY')
     self.api_secret = os.environ.get('TWITTER_API_SECRET')
     self.api_bearer = os.environ.get('TWITTER_API_BEARER')
     self.auth = tweepy.AppAuthHandler(self.api_key, self.api_secret)
     self.api = tweepy.API(self.auth)
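
A hypothetical usage, assuming the class holding this __init__ is named TwitterClient and the two environment variables are exported:

# export TWITTER_API_KEY=...; export TWITTER_API_SECRET=...
client = TwitterClient()
print(client.api.rate_limit_status()['resources']['search'])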
Example #17
# Twitter authentication part
# Replace API_KEY and API_SECRET with your application's key and secret from http://apps.twitter.com
# Put your API_KEY and API_SECRET on the first and second lines of the twitappkeys.txt
# file, placed in the dirapikeys directory specified below
import sys
import logging
import tweepy

dirapikeys = '/data/twitter/'
with open(dirapikeys + 'twitappkeys.txt') as fkeys:
    lfkeys = fkeys.readlines()
    API_KEY = lfkeys[0].strip()
    API_SECRET = lfkeys[1].strip()
    
maxTweets = 10000  # You can set this to whatever value you like,
# but keep in mind that the standard search API only returns tweets from roughly the last 7 days

tweetsPerQry = 100 # do not change this number

auth = tweepy.AppAuthHandler(API_KEY, API_SECRET)
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

if (not api):
    print ("Can't Authenticate")
    logging.debug("Can't Authenticate")
    sys.exit(-1)

# Advice: don't change the sinceId and max_id values
# If results from a specific ID onwards are required, set since_id to that ID;
# else default to no lower limit (sinceId = None) and go as far back as the API allows
sinceId = None
# If only results below a specific ID are required, set max_id to that ID;
# else default to no upper limit (max_id = -1) and start from the most recent tweet matching the search query
max_id = -1
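
The key file read above would simply contain the key on the first line and the secret on the second (placeholder values):

YOUR_API_KEY
YOUR_API_SECRET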
Example #18
 def __init__(self, key, secret):
     self.auth = tweepy.AppAuthHandler(key, secret)
     self.api = tweepy.API(self.auth)
Example #19
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 20 20:39:36 2021
@author: Ajit Johnson Nirmal
Twitter Post Finder
"""

# Lib
import tweepy
import datetime
import pandas as pd

# Authentication
auth = tweepy.AppAuthHandler(
    'VhVYh9RVGivvFanwDr7kg8qBt',
    'oXG5rJDopPVgThiq6xEU3fKNonpkDTFMvazQjSvgkw66NeuVsf')

# test
api = tweepy.API(auth)

# Run Function
pages = [
    'NewsfromScience', 'ScienceNews', 'newscientist', 'Immunoscope', 'nature',
    'NatureMedicine', 'nresearchnews', 'natBME', 'NatureBiotech',
    'NatureComms', 'NatImmunol', 'NatureNews', 'naturemethods',
    'NatureHumBehav', 'NatureAstronomy', 'NatureGenet', 'NatureNV',
    'NatureClimate', 'NatureCellBio', 'CellCellPress', 'TrendsCellBio',
    'Dev_Cell', 'CellSystemsCP', 'CellPressNews', 'TrendsMolecMed',
    'TrendsImmuno', 'MolecularCell', 'CellStemCell', 'NeuroCellPress',
    'Cancer_Cell', 'cellhostmicrobe', 'JCellBiol', 'NewsfromScience',
Example #20
import csv
import itertools
import tweepy
import os
from dotenv import load_dotenv

# Load environment variables containing Twitter API keys
load_dotenv()

# Authorize Tweepy API
auth = tweepy.AppAuthHandler(os.getenv('CONSUMER_KEY'),
                             os.getenv('CONSUMER_SECRET_KEY'))
api = tweepy.API(auth)

# Generate the search query based on all combinations of these phrases
names = ['Adia Barnes', "Arizona coach"]
modifiers = ['mother', 'breastfeeding', 'pumping', 'nursing', 'normalize']
query = ' OR '.join(
    map(lambda x: x[0] + ' ' + x[1], itertools.product(names, modifiers)))

# Open a file for writing the tweets to
with open('tweets.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    writer.writerow(
        ['text', 'id', 'created', 'is_retweet', 'favorites', 'retweets'])

    for tweet in tweepy.Cursor(api.search, q=query, lang='en-us').items():
        writer.writerow([
            tweet.text.encode(encoding='UTF-8',
                              errors='strict'), tweet.id, tweet.created_at,
            tweet.text.startswith('RT @'), tweet.favorite_count,
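
For the names and modifiers above, the generated query expands to all ten pairings:

# query == 'Adia Barnes mother OR Adia Barnes breastfeeding OR Adia Barnes pumping OR '
#          'Adia Barnes nursing OR Adia Barnes normalize OR Arizona coach mother OR '
#          'Arizona coach breastfeeding OR Arizona coach pumping OR Arizona coach nursing OR '
#          'Arizona coach normalize'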
Example #21
 def __init__(self):
     self.nlp = spacy.load('en_core_web_lg')
     self.contraction_mapping = {
         "ain't": "is not",
         "aren't": "are not",
         "can't": "cannot",
         "can't've": "cannot have",
         "'cause": "because",
         "could've": "could have",
         "couldn't": "could not",
         "couldn't've": "could not have",
         "didn't": "did not",
         "doesn't": "does not",
         "don't": "do not",
         "hadn't": "had not",
         "hadn't've": "had not have",
         "hasn't": "has not",
         "haven't": "have not",
         "he'd": "he would",
         "he'd've": "he would have",
         "he'll": "he will",
         "he'll've": "he will have",
         "he's": "he is",
         "how'd": "how did",
         "how'd'y": "how do you",
         "how'll": "how will",
         "how's": "how is",
         "I'd": "I would",
         "I'd've": "I would have",
         "I'll": "I will",
         "I'll've": "I will have",
         "I'm": "I am",
         "I've": "I have",
         "i'd": "i would",
         "i'd've": "i would have",
         "i'll": "i will",
         "i'll've": "i will have",
         "i'm": "i am",
         "i've": "i have",
         "isn't": "is not",
         "it'd": "it would",
         "it'd've": "it would have",
         "it'll": "it will",
         "it'll've": "it will have",
         "it's": "it is",
         "let's": "let us",
         "ma'am": "madam",
         "mayn't": "may not",
         "might've": "might have",
         "mightn't": "might not",
         "mightn't've": "might not have",
         "must've": "must have",
         "mustn't": "must not",
         "mustn't've": "must not have",
         "needn't": "need not",
         "needn't've": "need not have",
         "o'clock": "of the clock",
         "oughtn't": "ought not",
         "oughtn't've": "ought not have",
         "shan't": "shall not",
         "sha'n't": "shall not",
         "shan't've": "shall not have",
         "she'd": "she would",
         "she'd've": "she would have",
         "she'll": "she will",
         "she'll've": "she will have",
         "she's": "she is",
         "should've": "should have",
         "shouldn't": "should not",
         "shouldn't've": "should not have",
         "so've": "so have",
         "so's": "so as",
         "this's": "this is",
         "that'd": "that would",
         "that'd've": "that would have",
         "that's": "that is",
         "there'd": "there would",
         "there'd've": "there would have",
         "there's": "there is",
         "here's": "here is",
         "they'd": "they would",
         "they'd've": "they would have",
         "they'll": "they will",
         "they'll've": "they will have",
         "they're": "they are",
         "they've": "they have",
         "to've": "to have",
         "wasn't": "was not",
         "we'd": "we would",
         "we'd've": "we would have",
         "we'll": "we will",
         "we'll've": "we will have",
         "we're": "we are",
         "we've": "we have",
         "weren't": "were not",
         "what'll": "what will",
         "what'll've": "what will have",
         "what're": "what are",
         "what's": "what is",
         "what've": "what have",
         "when's": "when is",
         "when've": "when have",
         "where'd": "where did",
         "where's": "where is",
         "where've": "where have",
         "who'll": "who will",
         "who'll've": "who will have",
         "who's": "who is",
         "who've": "who have",
         "why's": "why is",
         "why've": "why have",
         "will've": "will have",
         "won't": "will not",
         "won't've": "will not have",
         "would've": "would have",
         "wouldn't": "would not",
         "wouldn't've": "would not have",
         "y'all": "you all",
         "y'all'd": "you all would",
         "y'all'd've": "you all would have",
         "y'all're": "you all are",
         "y'all've": "you all have",
         "you'd": "you would",
         "you'd've": "you would have",
         "you'll": "you will",
         "you'll've": "you will have",
         "you're": "you are",
         "you've": "you have"
     }
     self.keys = get_config_from_json('..//Keys//keys.json')
     self.auth = tw.AppAuthHandler(self.keys.twitter_keys.consumer_key,
                                   self.keys.twitter_keys.consumer_secret)
     self.api = tw.API(self.auth, wait_on_rate_limit=True)
     limit = self.api.rate_limit_status()
     limit = DotMap(limit)
     print(limit.resources.search)
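
For reference, a hedged sketch of reading the remaining search quota out of that structure, assuming the standard v1.1 rate-limit payload:

search_limits = limit.resources.search['/search/tweets']
print(search_limits['remaining'], 'search calls left; window resets at', search_limits['reset'])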
Example #22
def standard_search_crawler(max_tweets, area, tweet_file, user_list):
    auth = tweepy.AppAuthHandler(
        "ujqGAvWCv9C14893iDGd8aUfb",
        "7VQIxQ8cQJ0mQQwre9nRG0uJ2LEBRlHmOcPUpjHPHO9DmsoDTE")
    api = tweepy.API(auth,
                     wait_on_rate_limit=True,
                     wait_on_rate_limit_notify=True)
    if not api:
        print("Can't Authenticate")
    search_query = 'place:0ec0c4fcacbd0083'
    tweets_per_qry = 100  # this is the max the API permits
    tweet_count = 0
    print("Downloading max {0} tweets".format(max_tweets))
    with open(tweet_file, 'a', encoding='utf-8') as load_f:
        with open(user_list, 'a', encoding='utf-8') as load_f1:
            while tweet_count < max_tweets:
                new_tweets = api.search(q=search_query, count=tweets_per_qry)
                if not new_tweets:
                    print("No more tweets found")
                    break
                for tweet_status in new_tweets:
                    twi_dict = {}
                    tweet = tweet_status._json
                    twi_dict_x = None
                    twi_dict_y = None
                    twi_dict_t = None
                    if tweet.get('text'):
                        if tweet['lang'] == 'en':
                            twi_dict_t = tweet['text']
                    if tweet.get('coordinates'):
                        twi_dict_x = tweet['coordinates']['coordinates'][0]
                        twi_dict_y = tweet['coordinates']['coordinates'][1]
                    if tweet.get(
                            'geo'
                    ) and twi_dict_x is None and twi_dict_y is None:
                        twi_dict_x = tweet['geo']['coordinates'][1]
                        twi_dict_y = tweet['geo']['coordinates'][0]
                    if tweet.get('place'):
                        twi_dict_x = get_coord_by_box(
                            tweet['place']['bounding_box']['coordinates']
                            [0])[0]
                        twi_dict_y = get_coord_by_box(
                            tweet['place']['bounding_box']['coordinates']
                            [0])[1]
                    if twi_dict_x is not None and twi_dict_y is not None and twi_dict_t is not None and allocate_tweet(
                            twi_dict_x, twi_dict_y, area) is not None:
                        twi_dict["_id"] = tweet.get('id')
                        twi_dict["x_coord"] = twi_dict_x
                        twi_dict["y_coord"] = twi_dict_y
                        twi_dict["text"] = twi_dict_t
                        twi_dict["area"] = allocate_tweet(
                            twi_dict_x, twi_dict_y, area)
                        twi_dict['hashtags'] = tweet['entities']['hashtags']
                        twi_dict['time'] = tweet['created_at']
                        load_f.write(json.dumps(twi_dict) + "\n")
                        tweet_count += 1
                    line = {'id': tweet['user']['id_str']}
                    load_f1.write(json.dumps(line) + '\n')
    notice = "Downloaded {0} tweets, Saved to {1}".format(tweet_count, 'disk')
    return notice
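
get_coord_by_box is not shown in the excerpt; a plausible stand-in that returns the centroid of a bounding-box ring (an assumption, not the original helper):

def get_coord_by_box(ring):
    # ring is a list of [longitude, latitude] corner points.
    xs = [pt[0] for pt in ring]
    ys = [pt[1] for pt in ring]
    return [sum(xs) / len(xs), sum(ys) / len(ys)]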
Example #23
#  movienames='#moana OR #doctorstrange OR #allied OR #arrivalmovie OR #badsanta2 OR #almostchristmasmovie OR #assassinscreed  OR #collateralbeauty  OR #fantasticbeastsandwheretofindthem  OR #jackie  OR #lalaland  OR #passengers  OR #rogueonestarwarsstory  OR #sing'

tagsToSearch = '#CambMA'
print(tagsToSearch)
#-----------------------------------------------------------------------
# load  API credentials
#-----------------------------------------------------------------------
config = {}
# execfile("config.py", config)
exec(compile(open("config.py", "r").read(), "config.py", 'exec'), config)

#-----------------------------------------------------------------------
# create twitter API object
#-----------------------------------------------------------------------
# Setup tweepy to authenticate with Twitter credentials:
auth = tweepy.AppAuthHandler(config["consumer_key"], config["consumer_secret"])
# auth.set_access_token(config["access_token"], config["access_token_secret"])

# Create the api to connect to twitter with your credentials
api = tweepy.API(auth,
                 wait_on_rate_limit=True,
                 wait_on_rate_limit_notify=True,
                 compression=True)

# Create a producer to write json messages to kafka
#producer = KafkaProducer(bootstrap_servers=['kafka:9092'], # default is localhost:9092
#    value_serializer=lambda v: json.dumps(v).encode('utf-8'))
try:
    producer = KafkaProducer(
        bootstrap_servers=[
            "" + os.environ['HOSTNAME'] + ":" + os.environ['PORT'] + ""
Example #24
def tweet_to_db(searchQuery,
                start,
                end,
                tweetsPerQry=100,
                maxTweets=100000000):
    """
    This function pulls tweets with hashtag searchQuery from a specified time period.
    tweetsPerQry = 100 is the maximal number of tweets allowed by Twitter per query.
    maxTweets is some arbitrary big number.
    Tweets are instantly inserted into an SQL database with one table 'tweets',
    which has the following 5 columns:
    id - unique identifier
    tweet_id - BIGINT, unique tweet id
    hashtag - TEXT, the search query the tweet matched
    datetime - TEXT, datetime in format '%Y-%m-%d %H:%M:%S'
    content - TEXT, tweet content

    """
    # Obtain keys
    with open('/home/ubuntu/twitter_oauth.txt') as oauth:
        keys = oauth.readlines()
    consumer_key, consumer_secret, access_token = [x.strip() for x in keys]

    # Replace the API_KEY and API_SECRET with your application's key and secret.
    auth = tweepy.AppAuthHandler(consumer_key, consumer_secret)

    api = tweepy.API(auth,
                     wait_on_rate_limit=True,
                     wait_on_rate_limit_notify=True)

    startSince = start.strftime("%Y-%m-%d")
    endUntil = (end + datetime.timedelta(days=1)).strftime("%Y-%m-%d")

    end_tweet = api.search(q=searchQuery, count=1, until=endUntil)[0]
    start_tweet = api.search(q=searchQuery, count=1, until=startSince)[0]
    tweetCount = 0

    # Identify ending id of tweets within timeframe using a binary search
    # This unfortunately is not available directly through API
    while end_tweet.created_at - end > datetime.timedelta(0, 5, 0):
        mid_id = int((end_tweet.id + start_tweet.id) / 2)
        # Grab 10 tweets just to make sure they are not all zeros
        mid_tweet = api.search(q=searchQuery, count=10, max_id=mid_id)[0]
        if end - mid_tweet.created_at > datetime.timedelta(0, 5, 0):
            start_tweet = mid_tweet
        else:
            end_tweet = mid_tweet

    max_id = end_tweet.id

    # Create db to store results
    with open('/home/ubuntu/rds_keys.txt') as rds_keys:
        keys = rds_keys.readlines()
    host, dbname, rds_user, rds_pw = [x.strip() for x in keys]

    con = psycopg2.connect(host=host,
                           dbname=dbname,
                           user=rds_user,
                           password=rds_pw,
                           port='5432')
    cur = con.cursor()

    create_table = """
    CREATE TABLE IF NOT EXISTS tweets (id SERIAL PRIMARY KEY, tweet_id BIGINT, hashtag TEXT, 
    datetime TEXT, content TEXT);
    """
    cur.execute(create_table)
    con.commit()

    cur.execute("SELECT tweet_id FROM tweets WHERE hashtag = %s",
                [searchQuery])
    if not cur.rowcount:
        unique_ids = set()
    else:
        unique_ids = set([x[0] for x in cur.fetchall()])

    insert_tweet = """
    INSERT INTO tweets(tweet_id, hashtag, datetime, content) VALUES (%s, %s, %s, %s)
    """

    while tweetCount < maxTweets:
        try:
            new_tweets = api.search(q=searchQuery,
                                    count=tweetsPerQry,
                                    lang='en',
                                    max_id=str(max_id - 1),
                                    since=startSince,
                                    until=endUntil)

            if not new_tweets:
                print("No more tweets found")
                break
            if new_tweets[-1].created_at < start:
                print("Exhausted time interval.")
                break
            for tweet in new_tweets:
                if tweet.id not in unique_ids:
                    unique_ids.add(tweet.id)
                    tweet_datetime = tweet.created_at
                    tweet_datetime_str = tweet_datetime.strftime(
                        '%Y-%m-%d %H:%M:%S')
                    tweet_list_insert = [
                        tweet.id, searchQuery, tweet_datetime_str,
                        unicode(tweet.text).encode('ascii', 'replace')
                    ]
                    cur.execute(insert_tweet, tweet_list_insert)
                con.commit()
            tweetCount += len(new_tweets)
            max_id = new_tweets[-1].id
        except tweepy.TweepError as e:
            time.sleep(180)
            continue
    print("Total number of tweets: %s", tweetCount)
Example #25
                messages.append(message)
            with open('data.txt', 'a') as file:
                for m in messages:
                    pickle.dump(m, file, pickle.HIGHEST_PROTOCOL)
            tweetCount += len(new_tweets)
            print("Downloaded {0} tweets".format(tweetCount))
            max_id = new_tweets[-1].id
        except tweepy.TweepError as e:
            # Just exit if any error
            print("some error : " + str(e))
            break
    print("Downloaded {0} tweets, Saved to {1}".format(tweetCount, fName))


# Replace the API_KEY and API_SECRET with your application's key and secret.
auth = tweepy.AppAuthHandler(config[0], config[1])

api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

if (not api):
    print("Can't Authenticate")
    sys.exit(-1)

# Continue with rest of code

# Fetch search terms from `Search.txt`
with open('Search.txt', 'r') as file:
    search_list = file.readlines()
    search_list = [x.strip() for x in search_list]

for term in search_list:
Example #26
import sys
import urllib
import json
import tweepy
from flask import Flask, jsonify
from tweepy.parsers import JSONParser
from multiprocessing import Process, Queue

app = Flask(__name__)

TWITTER_API_KEY = "Replace this string with your twitter api key"
TWITTER_API_SECRET = "Replace this string with your twitter api secret key"
GOOGLE_API_KEY = "Replace this string with your google api key"
GOOGLE_API_CX = "Replace this string with your google api cx"


auth = tweepy.AppAuthHandler(TWITTER_API_KEY, TWITTER_API_SECRET)
api = tweepy.API(auth,
                 wait_on_rate_limit=True,
                 wait_on_rate_limit_notify=True,
                 parser=tweepy.parsers.JSONParser())
if (not api):
    print("Can't Authenticate Twitter")
    sys.exit(-1)


#For storing the results of the three following parallel functions
result_queue = Queue()		


#DuckDuckGo instant API
Example #27
from sklearn.externals import joblib
import tweepy
from threading import Thread
from flask_sqlalchemy import SQLAlchemy
from models import *

app = Flask(__name__)
app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///gbrannotation.sqlite3'
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
app.secret_key = os.urandom(24)
db = SQLAlchemy(app)
thread = None

clf = joblib.load(os.path.join(config.APP_STATIC, 'gbr_multi_label.pkl'))
mlb = joblib.load(os.path.join(config.APP_STATIC, 'mlb.pkl'))
auth = tweepy.AppAuthHandler(config.CONSUMER_KEY, config.CONSUMER_SECRET)
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
SEARCH_TERM = ['Great Barrier Reef', 'GBR', 'greatbarrierreef']


def predict(sentence):
    predicted = clf.predict(sentence)
    inverse_pred = mlb.inverse_transform(predicted)
    return inverse_pred


def background_thread():
    """Example of how to send server generated events to clients."""
Example #28
 def __init__(self):
     # Authenticate to the twitter api
     self.auth = tweepy.AppAuthHandler(os.environ['TWITTER_API_KEY'],
                                       os.environ['TWITTER_API_SECRET'])
     self.api = tweepy.API(self.auth, wait_on_rate_limit=True)
Example #29
import tweepy
import sys

# Replace the API_KEY and API_SECRET with your application's key and secret.
auth = tweepy.AppAuthHandler(
    "sFH28qEj9uf9zWt8Ecws9h8jS",
    "0lkJlb2jBVNeahQ40EZs84W7mhjmXLWF12AGQpqDBI8yj1pffY")

api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

if (not api):
    print("Can't Authenticate")
    sys.exit(-1)

# Continue with rest of code
import jsonpickle
import os

searchQuery = ['Hillary', 'Clinton',
               '#imwithher']  # this is what we're searching for
maxTweets = 10000000  # Some arbitrary large number
tweetsPerQry = 100  # this is the max the API permits
fName = 'hillary_tweets.txt'  # We'll store the tweets in a text file.

# If results from a specific ID onwards are reqd, set since_id to that ID.
# else default to no lower limit, go as far back as API allows
sinceId = 725171200100433920

# If only results below a specific ID are required, set max_id to that ID.
# else default to no upper limit, start from the most recent tweet matching the search query.
#max_id = -1L
Example #30
import sys
import tweepy
import pandas as pd


def get_data():

    # ATOKEN = open("/Users/mmiyazaki/Documents/My project/Airline_analysis/src/get_data/credentials/atoken.txt","r").read()
    # ASECRET = open("/Users/mmiyazaki/Documents/My project/Airline_analysis/src/get_data/credentials/asecret.txt","r").read()
    CKEY = open(
        "/Users/mmiyazaki/Documents/My project/Airline_analysis/src/get_data/credentials/ckey.txt",
        "r").read()
    CSECRET = open(
        "/Users/mmiyazaki/Documents/My project/Airline_analysis/src/get_data/credentials/csecret.txt",
        "r").read()

    with open(
            "/Users/mmiyazaki/Documents/My project/Airline_analysis/src/data/companies.txt"
    ) as file_in:
        companies = []
        for line in file_in:
            companies.append(line.replace('\n', ""))

    auth = tweepy.AppAuthHandler(CKEY, CSECRET)

    api = tweepy.API(auth,
                     wait_on_rate_limit=True,
                     wait_on_rate_limit_notify=True)

    if (not api):
        print("Can't Authenticate")
        sys.exit(-1)

    df = pd.DataFrame()

    for company in companies:
        print(company)
        searchQuery = company + " -RT"  # this is what we're searching for
        print("search query :", searchQuery)
        maxTweets = 200  # maximum number of tweets to collect per company
        tweetsPerQry = 100  # this is the max the API permits
        # tweepy_REST_API = dataiku.Folder("tweepy_REST_API")
        # folder_path = tweepy_REST_API.get_path()
        # fName = folder_path + '/tweets.txt' # We'll store the tweets in a text file.

        # If results from a specific ID onwards are required, set since_id to that ID.
        # else default to no lower limit, go as far back as API allows
        sinceId = None

        # If only results below a specific ID are required, set max_id to that ID.
        # else default to no upper limit, start from the most recent tweet matching the search query.
        max_id = -1

        tweetCount = 0

        print("Downloading max {0} tweets".format(maxTweets))
        # with open(fName, 'w') as f:
        while tweetCount < maxTweets:
            try:
                if (max_id <= 0):
                    if (not sinceId):
                        new_tweets = api.search(q=searchQuery,
                                                count=tweetsPerQry,
                                                lang="en")
                    else:
                        new_tweets = api.search(q=searchQuery,
                                                count=tweetsPerQry,
                                                since_id=sinceId,
                                                lang="en")
                else:
                    if (not sinceId):
                        new_tweets = api.search(q=searchQuery,
                                                count=tweetsPerQry,
                                                max_id=str(max_id - 1),
                                                lang="en")
                    else:
                        new_tweets = api.search(q=searchQuery,
                                                count=tweetsPerQry,
                                                max_id=str(max_id - 1),
                                                since_id=sinceId,
                                                lang="en")
                if not new_tweets:
                    print("No more tweets found")
                    break
                for tweet in new_tweets:
                    # f.write(jsonpickle.encode(tweet._json, unpicklable=False) + '\n')

                    if len(tweet._json['entities']['urls']) == 0:
                        links = ""
                    else:
                        links = tweet._json['entities']['urls'][0][
                            "expanded_url"]

                    if tweet._json["place"] == None:
                        country = ""
                    else:
                        country = tweet._json["place"]['country']

                    if tweet._json["geo"] != None:
                        coordinates = str(tweet._json["geo"]['coordinates'])
                    else:
                        coordinates = ""

                    new_row = pd.DataFrame({
                        'timestamp':
                        tweet._json['created_at'],
                        'tweet_id': [tweet._json["id"]],
                        'text':
                        tweet._json["text"],
                        'hashtags': [tweet._json["entities"]["hashtags"]],
                        "links":
                        links,
                        # 'user_mentions':tweet._json["entities"]["user_mentions"][0]["screen_name"],
                        # 'user_mentions_id':tweet._json["entities"]["user_mentions"][0]["id"],
                        # 'user_mentions_indices':[tweet._json["entities"]["user_mentions"][0]["indices"]],
                        'in_reply_to_status_id':
                        tweet._json["in_reply_to_status_id"],
                        'in_reply_to_user_id':
                        tweet._json["in_reply_to_user_id"],
                        'in_reply_to_screen_name':
                        tweet._json["in_reply_to_screen_name"],
                        'user_id':
                        tweet._json["user"]["id"],
                        'username':
                        tweet._json["user"]["name"],
                        'screen_name':
                        tweet._json["user"]["screen_name"],
                        'user_location':
                        tweet._json["user"]["location"],
                        'followers_count':
                        tweet._json["user"]["followers_count"],
                        'friends_count':
                        tweet._json["user"]["friends_count"],
                        'user_creation':
                        tweet._json["user"]["created_at"],
                        'favourites_count':
                        tweet._json["user"]["favourites_count"],
                        'coordinates':
                        coordinates,
                        # 'geo':tweet._json["geo"],
                        'country':
                        country,
                        'retweets':
                        tweet._json["retweet_count"],
                        'retweeted':
                        tweet._json["retweeted"],
                        'lang':
                        tweet._json["lang"]
                    })
                    new_row["company"] = company
                    df = df.append(new_row, ignore_index=True)

                tweetCount += len(new_tweets)
                print("Downloaded {0} tweets".format(tweetCount))
                max_id = new_tweets[-1].id
            except tweepy.TweepError as e:
                # Just exit if any error
                print("some error : " + str(e))
                break

        print("Downloaded {0} tweets for {1}".format(tweetCount, company))

    df.to_csv(
        "/Users/mmiyazaki/Documents/My project/Airline_analysis/src/data/raw_tweets.csv"
    )