def __init__(self, detailsList, proxies=None, logger=None, relative_dir=""):
    '''
    Store the scrape configuration and set up logging.

    Parameters:
    ___________
    detailsList (list):
    List containing keyword, coinname in string and start and end in date format

    proxies (list of dict or None):
    List of dict in proxies format (containing http, https and ftp) to use
    for each next query. None to not use any proxy

    logger (logger):
    Logger to use. If None, one is created with get_logger() for
    logs/twitterscraper.log below the root returned by get_locations()

    relative_dir (string):
    Stored as-is on the instance; not otherwise used in this constructor
    '''
    # Only the second value returned by get_locations() is needed here.
    _, self.currRoot_dir = get_locations()

    if logger is None:
        # Pass root and log path as separate components. Gluing them with
        # "+" inside os.path.join() inserted no separator and produced
        # "<root>logs/..." whenever the root had no trailing slash.
        self.logger = get_logger(
            os.path.join(self.currRoot_dir, "logs/twitterscraper.log"))
    else:
        self.logger = logger

    self.detailsList = detailsList
    self.relative_dir = relative_dir
    self.proxies = proxies
def __init__(self, proxy=None, logger=None):
    '''
    Set up the header list, the search URL templates, the logger and the proxy.

    Parameters:
    ___________
    logger: (logger)
    Logger object to log to. If None, one is created with get_logger() for
    /logs/twitterscraper.log below the root returned by get_locations()

    proxy: (dict)
    Single proxy to use or None. Dictionary containing http, https and ftp
    proxy to use with requests
    '''
    # Browser user-agent strings
    self.HEADERS_LIST = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0',
        'Mozilla/5.0 (X11; Linux x86_64; rv:58.0) Gecko/20100101 Firefox/58.0',
    ]

    # URL template with {q} and {lang} placeholders
    self.INIT_URL = "https://twitter.com/search?vertical=tweets&vertical=default&q={q}&l={lang}"
    # URL template with {pos}, {q} and {lang} placeholders
    self.RELOAD_URL = (
        "https://twitter.com/i/search/timeline?vertical="
        "default&include_available_features=1&include_entities=1&"
        "reset_error_state=false&src=typd&max_position={pos}&q={q}&l={lang}"
    )

    # Only the second value returned by get_locations() is needed here.
    _, self.currRoot = get_locations()

    # Identity check: None is a singleton and "==" can be overridden.
    if logger is None:
        self.logger = get_logger(self.currRoot + "/logs/twitterscraper.log")
    else:
        self.logger = logger

    self.proxy = proxy
def setup_method(self):
    '''
    Prepare the fixture: build the keyword table, run a Stream with a
    MyStreamListener (tweetCount=10) filtered on those keywords, and keep
    the first two values of the listener's get_data() on the instance.
    '''
    self.keywords = {
        'bitcoin': ['bitcoin', 'BTC'],
        'dashcoin': ['dashcoin', 'DASH', 'darkcoin'],
        'dogecoin': ['dogecoin', 'DOGE'],
        'ethereum': ['ethereum', 'ETH'],
        'litecoin': ['litecoin', 'LTC'],
        'ripple': ['ripple', 'XRP'],
        'monero': ['monero', 'XMR'],
        'stellar': ['stellar', 'STR']
    }

    # Flatten every coin's keyword list into one list, in dict order.
    flattened = []
    for terms in self.keywords.values():
        flattened.extend(terms)
    self.keywordsOnly = flattened

    _, self.currRoot_dir = get_locations()
    log_path = self.currRoot_dir + '/livescraper/tests/live.log'
    self.logger = get_logger(log_path)

    self.listener = MyStreamListener(self.keywords, self.logger, tweetCount=10)

    # get_twitter() supplies the four credentials in this order.
    credentials = get_twitter()
    consumer_key, consumer_secret, access_token, access_token_secret = credentials
    auth = OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)

    self.myStream = Stream(auth=auth, listener=self.listener)
    self.myStream.filter(track=self.keywordsOnly, languages=['en'])

    # The third value returned by get_data() is not used by the tests.
    collected = self.listener.get_data()
    self.df, self.userData, _ = collected
def __init__(self, keywords, historicList, proxies, relative_dir="/"):
    '''
    Runs everything for twitter

    Parameters:
    ___________
    keywords (dict):
    Mapping whose keys are stored in self.coins (presumably coin names
    -- confirm against the caller)

    historicList:
    Stored as-is on the instance

    proxies:
    Stored as-is on the instance

    relative_dir (string):
    Directory below the root returned by get_locations(). Leading path
    separators are ignored, so the default "/" means the root itself
    '''
    _, currRoot_dir = get_locations()

    self.relative_dir = relative_dir

    # os.path.join() discards every earlier component when a later one is
    # absolute, so joining the default "/" used to turn currDir into the
    # filesystem root. Strip leading separators before joining.
    relative_part = relative_dir.lstrip("/" + os.sep)
    if relative_part:
        self.currDir = os.path.join(currRoot_dir, relative_part)
    else:
        self.currDir = currRoot_dir

    # "{}" is left in the path as a placeholder to be filled in later.
    self.historic_path = os.path.join(self.currDir,
                                      "data/tweet/{}/historic_scrape")
    self.logger = get_logger(os.path.join(self.currDir, 'logs/live.txt'))

    self.coins = list(keywords)
    self.historicList = historicList
    self.proxies = proxies
def __init__(self, keywords, logger=None, tweetCount=0):
    '''
    Set up the logger, an API object, empty tweet and user tables and
    the counters.

    Parameters:
    ___________
    keywords: (dict)
    Dictionary containing coinname and its relevant keywords
    Example: {'bitcoin': ['bitcoin', 'BTC'], 'dashcoin': ['dashcoin', 'DASH', 'darkcoin']}

    logger (logger) (optional):
    Logger to use. If None, one is created with get_logger() for
    /logs/live.txt below the root returned by get_locations()

    tweetCount (int) (optional):
    If not set to 0, the program will terminate after n tweets are found
    '''
    _, self.currRoot_dir = get_locations()

    # Identity check: None is a singleton and "==" can be overridden.
    if logger is None:
        self.logger = get_logger(self.currRoot_dir + '/logs/live.txt')
    else:
        self.logger = logger

    self.api = API()

    # Empty table with one column per tweet field.
    self.df = pd.DataFrame(columns=[
        'ID', 'Tweet', 'Time', 'User', 'Likes', 'Replies', 'Retweets',
        'in_response_to', 'response_type', 'coinname'
    ])

    # Empty table with one column per user field.
    self.userData = pd.DataFrame(columns=[
        'username', 'created', 'location', 'has_location', 'is_verified',
        'total_tweets', 'total_following', 'total_followers', 'total_likes',
        'has_avatar', 'has_background', 'is_protected', 'profile_modified'
    ])

    self.keywords = keywords
    # Construction time as whole seconds since the epoch.
    self.start_time = int(time.time())

    self.tweetCount = tweetCount
    self.statusCount = 0
def __init__(self, keywords, logger=None, tweetCount=0):
    """
    Store the keyword configuration, set up logging and create the
    directory structure through runUtils.

    Parameters:
    ___________
    keywords (dictionary):
    Dictionary containing coinname and its relevant keywords
    Example: {'bitcoin': ['bitcoin', 'BTC'], 'dashcoin': ['dashcoin', 'DASH', 'darkcoin']}

    logger (logger) (optional):
    Logger to use. If None, one is created with get_logger() for
    /logs/live.log below the root returned by get_locations()

    tweetCount (int) (optional):
    If not set to 0, the program will terminate after n tweets are found
    """
    # TODO: Relative directory needs to be moved here not in function below.
    # And replace all with os.path.join
    _, self.currRoot_dir = get_locations()

    self.tweetCount = tweetCount
    self.keywords = keywords
    # The dictionary keys.
    self.coins = list(keywords)
    # Every keyword of every coin, flattened into one list.
    self.keywordsOnly = [
        value for values in keywords.values() for value in values
    ]

    # Identity check: None is a singleton and "==" can be overridden.
    if logger is None:
        self.logger = get_logger(self.currRoot_dir + '/logs/live.log')
    else:
        self.logger = logger

    # Runs at construction time; runUtils is created without this logger.
    runUtils(self.keywords).create_directory_structure()
def __init__(self, keywords, logger=None):
    '''
    Store the keywords and their keys and set up logging.

    Parameters:
    ___________
    keywords (dict):
    Mapping whose keys are stored in self.coins (presumably coin names
    mapped to keyword lists -- confirm against the caller)

    logger (logger) (optional):
    Logger to use. If None, one is created with get_logger() for
    /logs/run_utils.txt below the root returned by get_locations()
    '''
    self.keywords = keywords
    self.coins = list(keywords)

    _, self.currRoot_dir = get_locations()

    # Identity check: None is a singleton and "==" can be overridden.
    if logger is None:
        self.logger = get_logger(self.currRoot_dir + '/logs/run_utils.txt')
    else:
        self.logger = logger
def __init__(self, proxy=None, logger=None):
    '''
    Set up the header list, the proxy and the logger.

    Parameters:
    ___________
    proxy:
    Stored as-is on the instance

    logger (logger) (optional):
    Logger to use. If None, one is created with get_logger() for
    logs/profilescraper.log below the root returned by get_locations()
    '''
    # Browser user-agent strings
    self.HEADERS_LIST = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0',
        'Mozilla/5.0 (X11; Linux x86_64; rv:58.0) Gecko/20100101 Firefox/58.0',
    ]

    self.proxy = proxy

    _, self.currRoot = get_locations()

    # Identity check: None is a singleton and "==" can be overridden.
    if logger is None:
        self.logger = get_logger(
            os.path.join(self.currRoot, "logs/profilescraper.log"))
    else:
        self.logger = logger
def test_get_logger():
    '''
    Check that a logger from get_logger() writes its message to the given
    file, then check that the file can be removed.

    The log file is deleted in a finally block so that a failing assertion
    does not leave a stale test.log behind for the next run.
    '''
    _, rootDir = get_locations()
    flocation = rootDir + "/libs/tests/test.log"

    try:
        logger = get_logger(flocation)
        logger.info("abc")

        assert os.path.exists(flocation)

        with open(flocation, 'r') as f:
            assert f.readlines()[0] == 'abc\n'
    finally:
        # Guarded so a missing file cannot mask the original failure.
        if os.path.exists(flocation):
            os.remove(flocation)

    assert not os.path.exists(flocation)
def __init__(self, detailsList, relative_dir=""):
    '''
    Set up the logger and the profile data path.

    Parameters:
    ___________
    detailsList (list):
    List containing keyword, coinname in string and start and end in date
    format. For looping

    relative_dir (string):
    The relative directory, placed between the root returned by
    get_locations() and "data/profile"
    '''
    self.detailsList = detailsList

    _, root_dir = get_locations()

    log_file = os.path.join(root_dir, "logs/profileprocess.log")
    self.logger = get_logger(log_file)

    profile_parts = (root_dir, relative_dir, "data/profile")
    self.profile_path = os.path.join(*profile_parts)
def __init__(self, profiles, proxies=None, relative_dir=""):
    '''
    Functions to call to save historic profiles

    Parameters:
    __________
    profiles (Pandas Series or list or numpy):
    Series of profile name

    proxies:
    Stored as-is on the instance

    relative_dir (string):
    Placed between the root returned by get_locations() and "data/profile"
    '''
    self.profiles = profiles
    self.proxies = proxies

    _, root_dir = get_locations()
    self.currRoot = root_dir

    log_file = os.path.join(root_dir, "logs/profilescraper.log")
    self.logger = get_logger(log_file)

    path_parts = (root_dir, relative_dir, "data/profile")
    self.path = os.path.join(*path_parts)
def __init__(self, detailsList, algo_name, relative_dir=""):
    '''
    Set up the logger and the historic scrape path.

    Parameters:
    ___________
    detailsList (list):
    List containing keyword, coinname in string and start and end in date
    format. For looping

    algo_name (string):
    The name of the algorithm so as to save in file

    relative_dir (string):
    The relative directory, placed between the root returned by
    get_locations() and "data/tweet/{}/historic_scrape"
    '''
    _, currRoot_dir = get_locations()

    # The log path must be relative: with a leading "/" os.path.join()
    # discarded currRoot_dir and the log went to /logs at the filesystem root.
    self.logger = get_logger(
        os.path.join(currRoot_dir, "logs/historicprocess.log"))

    self.detailsList = detailsList
    self.algo_name = algo_name

    # "{}" is left in the path as a placeholder to be filled in later.
    self.historic_path = os.path.join(currRoot_dir, relative_dir,
                                      "data/tweet/{}/historic_scrape")
def setup_method(self):
    '''
    Prepare the fixture: build the keyword table, run a query_live_tweets
    search (tweetCount=10) and keep the first two values of the listener's
    get_data() on the instance.
    '''
    self.keywords = {
        'bitcoin': ['bitcoin', 'BTC'],
        'dashcoin': ['dashcoin', 'DASH', 'darkcoin'],
        'dogecoin': ['dogecoin', 'DOGE'],
        'ethereum': ['ethereum', 'ETH'],
        'litecoin': ['litecoin', 'LTC'],
        'ripple': ['ripple', 'XRP'],
        'monero': ['monero', 'XMR'],
        'stellar': ['stellar', 'STR']
    }

    # Flatten every coin's keyword list into one list, in dict order.
    flattened = []
    for terms in self.keywords.values():
        flattened.extend(terms)
    self.keywordsOnly = flattened

    _, self.currRoot_dir = get_locations()
    log_path = self.currRoot_dir + '/livescraper/tests/live.log'
    # This logger is kept on the test instance; it is not passed to
    # query_live_tweets below.
    self.logger = get_logger(log_path)

    self.qt = query_live_tweets(self.keywords, tweetCount=10)

    # get_listener() returns a pair; only the listener is used afterwards.
    listener, _auth = self.qt.get_listener(create=True)
    self.qt.perform_search()

    # The third value returned by get_data() is not used by the tests.
    collected = listener.get_data()
    self.df, self.userData, _ = collected
shutil.rmtree("{}/data/tweet/{}".format( self.currDir, coinname)) except: pass self.logger.info("Removing {}/data/profile/{}".format( self.currDir, coinname)) try: shutil.rmtree("{}/data/profile".format(self.currDir, coinname)) except: pass _, currRoot_dir = get_locations() logger = get_logger(currRoot_dir + '/logs/run_live.txt') liveKeywords, historicList = get_keywords() proxies = get_proxies() ra = runAll(liveKeywords, historicList, proxies=proxies) ra.initial_houskeeping(clean=False) #change when required t1 = threading.Thread(target=download_live, args=[liveKeywords, logger]) t1.start() logger2 = get_logger(currRoot_dir + '/logs/run_live_2.txt') #house folder contains kunai ma nabhako data while True: try: