def write_to_file_chunks(file_path, lines, chunk_size=100):
    """
    Append a list of lines to the file in file_path, batching the writes.

    Every item of ``lines`` is written followed by a newline. Short lists are
    written with a single ``writelines`` call; longer ones are split with
    ``FileUtil.chunks`` and written one chunk per call.

    Parameters
    ----------
    file_path : str
        The file path. Its parent directory is created when missing.
    lines : list
        The lines we want to write to the file (without trailing newlines).
    chunk_size : int or 100
        The number of lines to write in one IO operation.

    Returns
    -------
    None
        Errors are reported through ErrorWrapper and printed, not raised.
    """
    try:
        FileUtil.check_directory(file_path)
        # The context manager closes the handle even when a write fails.
        with io.open(file_path, 'a', encoding="utf-8") as f:
            # A non-positive chunk size cannot be used for splitting, and a
            # list that fits in one chunk needs no splitting at all.
            if chunk_size <= 0 or len(lines) <= chunk_size:
                f.writelines(line + '\n' for line in lines)
            else:
                for chunk in FileUtil.chunks(lines, chunk_size):
                    f.writelines(line + '\n' for line in chunk)
    except Exception as e:
        print(ErrorWrapper(e).handle())
def csv_reader(file_path, columns_names, index_col_name):
    """
    Read the csv file in file_path and return its data rows as tuples.

    The file is parsed without a header, so its first line is read as data
    and then dropped from the result.

    Parameters
    ----------
    file_path : str
        The file path.
    columns_names : list
        The columns names you want to read.
    index_col_name : str
        The index column name.

    Returns
    -------
    list of tuple
        One tuple per row after the first: the index value followed by the
        remaining column values. None when reading fails (the error is
        printed through ErrorWrapper).
    """
    try:
        frame = pandas.read_csv(
            file_path,
            sep="[,]",
            lineterminator='\n',
            engine='python',
            header=None,
            names=columns_names,
            index_col=index_col_name,
            quoting=csv.QUOTE_NONE,
        )
        # Skip the first parsed row in both the values and the index.
        body = frame.values[1:]
        labels = frame.index.values[1:]
        return [(label,) + tuple(record) for record, label in zip(body, labels)]
    except Exception as e:
        print(ErrorWrapper(e).handle())
def check_directory(file_path):
    """
    Make sure the directory part of file_path exists, creating it if needed.

    The directory is everything up to and including the last '/' of
    file_path. A path without any '/' has no directory part and nothing is
    created. A path that ends with '/' is treated as a directory path.

    Parameters
    ----------
    file_path : str
        The file path.

    Returns
    -------
    None
        Errors are reported through ErrorWrapper and printed, not raised.
    """
    try:
        the_directory, separator, _ = file_path.rpartition('/')
        if not separator:
            # Bare file name: nothing to create.
            return
        # exist_ok avoids the check-then-create race and makes repeated
        # calls on an existing directory a no-op.
        Path(the_directory + separator).mkdir(parents=True, exist_ok=True)
    except Exception as e:
        print(ErrorWrapper(e).handle())
def handle_status(self, status, config):
    """
    Route an incoming status: buffer it, or flush the buffered batch.

    While less than 60 minutes have elapsed in the current hourly window the
    status is parsed into the in-memory lists and both elapsed-time counters
    are refreshed. Otherwise the hourly window is restarted and the batch is
    written out; if more than 3 days have elapsed as well, the daily window
    is restarted and the reactions are collected.

    NOTE(review): the status that arrives when the hourly window has expired
    is not parsed, so it is not stored in any batch - confirm this is
    intended.

    Parameters
    ----------
    status : tweepy.Status
        The status to handle.
    config : str
        The application keys file path (for reactions collecting).

    Returns
    -------
    None
    """
    try:
        now = datetime.now()

        if self.hourly_elapsed_time < 60:
            self.parse(status)
            # Minutes since the hourly window opened, whole days since the
            # daily window opened.
            self.hourly_elapsed_time = (now - self.hourly_start_time).total_seconds()/60
            self.daily_elapsed_time = (now - self.daily_start_time).days
            return

        # Hourly window expired: restart it and persist the batch.
        self.hourly_elapsed_time = 0
        self.hourly_start_time = now
        self.handle_batching()

        if not self.daily_elapsed_time > 3:
            return

        # Daily window expired too: restart it and collect reactions.
        self.daily_elapsed_time = 0
        self.daily_start_time = now
        self.handle_reactions(config, self.days_range)
    except Exception as e:
        ErrorWrapper(e).handle()
def get_ids(self, days_range):
    """
    Load the stored status ids for the dates covered by days_range.

    For every date produced by DatesHandler the files found in
    ``<location>/1/<date>`` are read, one entry per file.

    Parameters
    ----------
    days_range : int
        The number of days range to count back.

    Returns
    -------
    ids : list
        One list per date; each item is a dict with the file's lines under
        "data" and the file name under "index". None when an error occurs
        (it is reported through ErrorWrapper).
    """
    try:
        now = datetime.today()
        collected = []
        for day in DatesHandler(now, now - timedelta(days_range)).date_range():
            day_dir = self.location + "/1/" + str(day.date())
            # Keep regular files only; sub-directories are ignored.
            file_names = [name for name in listdir(day_dir) if isfile(join(day_dir, name))]
            collected.append([
                {"data": FileUtil.file_reader(day_dir + "/" + name), "index": name}
                for name in file_names
            ])
        return collected
    except Exception as e:
        ErrorWrapper(e).handle()
def handle_reactions(self, config, days_range):
    """
    Fetch the favourites and retweets counts and write them to the file system.

    The counts returned by get_reactions hold one entry per date, in the same
    order as the dates produced by DatesHandler, so dates and counts are
    paired by position starting from the first entry. Favourites go under
    ``<location>/16/<date>/`` and retweets under ``<location>/17/<date>/``,
    each in a file named after the entry's "index".

    Parameters
    ----------
    config : str
        The Twitter application configuration file path.
    days_range : int
        The previous days range to get.

    Returns
    -------
    None
        Errors are reported through ErrorWrapper, not raised.
    """
    try:
        (favourites_count, retweets_count) = self.get_reactions(config, days_range)
        today = datetime.today()
        dates = DatesHandler(today, today - timedelta(days_range)).date_range()
        # zip pairs the n-th date with the n-th day of counts and stops at
        # the shortest sequence, so no index can run past the data.
        for date, day_favourites, day_retweets in zip(dates, favourites_count, retweets_count):
            day = str(date.date())
            favourites_files_path = self.location + "/16/" + day + "/"
            retweets_files_path = self.location + "/17/" + day + "/"
            for hour_favourites, hour_retweets in zip(day_favourites, day_retweets):
                FileUtil.write_to_file_chunks(favourites_files_path + hour_favourites["index"],
                                              hour_favourites["data"], 500)
                FileUtil.write_to_file_chunks(retweets_files_path + hour_retweets["index"],
                                              hour_retweets["data"], 500)
    except Exception as e:
        ErrorWrapper(e).handle()
def build_api(self):
    """
    Create a Twitter API client from the keys file referenced by self.config.

    The first four lines of the file are used, in order, as the two
    OAuthHandler arguments and the two set_access_token arguments.

    Parameters
    ----------

    Returns
    -------
    Api
        The twitter API instance, or None when an error occurs (the error is
        printed through ErrorWrapper).
    """
    try:
        credentials = FileUtil.file_reader(self.config)
        handler = OAuthHandler(credentials[0], credentials[1])
        handler.set_access_token(credentials[2], credentials[3])
        # NOTE(review): retry_errors is presumably meant to be a collection
        # of HTTP status codes; the bare integer 5 looks suspicious - confirm
        # against the tweepy version in use.
        options = {
            "wait_on_rate_limit": True,
            "wait_on_rate_limit_notify": True,
            "retry_count": 10,
            "retry_delay": 5,
            "retry_errors": 5,
        }
        return API(handler, **options)
    except Exception as e:
        print(ErrorWrapper(e).handle())
def write_to_file(file_path, lines):
    """
    Append a list of lines to the file in file_path.

    Each item is written exactly as given, followed by a newline. The list
    passed in is left untouched.

    Parameters
    ----------
    file_path : str
        The file path. Its parent directory is created when missing.
    lines : list
        The lines we want to write to the file.

    Returns
    -------
    None
        Errors are reported through ErrorWrapper and printed, not raised.
    """
    try:
        FileUtil.check_directory(file_path)
        # The context manager closes the handle even when a write fails.
        with io.open(file_path, 'a', encoding="utf-8") as f:
            for line in lines:
                # Lines are written verbatim (trailing whitespace included),
                # which keeps the file content identical to what this
                # function has always produced.
                f.write(line + '\n')
    except Exception as e:
        print(ErrorWrapper(e).handle())
def parse(self, status):
    """
    Extract the attributes of a Status and append each to its partial list.

    All 15 values are computed first and appended only once every one of them
    is available, so a status that fails to parse never leaves the parallel
    lists with different lengths.

    Column layout of self.statuses: 0 id, 1 text (whitespace collapsed),
    2 is-retweet flag, 3 place country code, 4 created_at, 5 user id,
    6 user screen name, 7 retweeted status id, 8 retweeted status author,
    9 retweeted status retweet count, 10 retweeted status favorite count,
    11 number of urls, 12 number of hashtags, 13 followers count,
    14 friends count. Every value is stored as a string; missing values are
    stored as "None".

    Parameters
    ----------
    status : tweepy.Status
        The status to parse.

    Returns
    -------
    None
        Errors are reported through ErrorWrapper, not raised.
    """
    try:
        # \s+ also matches newlines, so one substitution flattens the text.
        text = re.sub(u"\\s+", u" ", status.text)

        if hasattr(status, 'retweeted_status'):
            source = status.retweeted_status
            is_retweet = str(True)
            retweet_fields = [
                str(source.id),
                str(source.user.screen_name),
                str(source.retweet_count),
                str(source.favorite_count),
            ]
        else:
            is_retweet = str(False)
            retweet_fields = [str(None)] * 4

        if hasattr(status, 'place') and hasattr(status.place, 'country_code'):
            country = str(status.place.country_code)
        else:
            country = str(None)

        row = [
            str(status.id),
            text,
            is_retweet,
            country,
            str(status.created_at),
            str(status.user.id),
            str(status.user.screen_name),
        ] + retweet_fields + [
            str(len(status.entities['urls'])),
            str(len(status.entities['hashtags'])),
            str(status.user.followers_count),
            str(status.user.friends_count),
        ]

        if len(self.statuses) < len(row):
            raise IndexError("statuses holds fewer than %d partial lists" % len(row))

        # Nothing above touched self.statuses, so this is all-or-nothing.
        for index, value in enumerate(row):
            self.statuses[index].append(value)
    except Exception as e:
        ErrorWrapper(e).handle()
def get_reactions(self, config, days_range):
    """
    Collect the favourites and retweets counts of the stored tweets.

    The ids returned by get_ids are walked day by day and file by file; for
    each tweet id the counts are requested through a TwitterApiWrapper and
    stored as strings, keeping the same nesting and the same "index" as the
    ids they come from.

    Parameters
    ----------
    config : str
        The Twitter application configuration file path.
    days_range : int
        The number of days range to count back.

    Returns
    -------
    favourites_count : list
        Per day, a list of dicts with the favourites counts under "data" and
        the source file name under "index".
    retweets_count : list
        Same layout, holding the retweets counts.
        Nothing is returned when an error occurs (it is reported through
        ErrorWrapper).
    """
    try:
        ids = self.get_ids(days_range)
        api = TwitterApiWrapper(config)
        favourites_count, retweets_count = [], []
        for day in ids:
            day_favourites, day_retweets = [], []
            for hour in day:
                favourites, retweets = [], []
                for tweet_id in hour["data"]:
                    favourites.append(str(api.get_favourites(tweet_id)))
                    retweets.append(str(api.get_retweets(tweet_id)))
                day_favourites.append({"data": favourites, "index": hour["index"]})
                day_retweets.append({"data": retweets, "index": hour["index"]})
            favourites_count.append(day_favourites)
            retweets_count.append(day_retweets)
        return favourites_count, retweets_count
    except Exception as e:
        ErrorWrapper(e).handle()
def handle_batching(self):
    """
    Write the buffered statuses lists to the file system and empty them.

    Column ``i`` of self.statuses is appended to
    ``<location>/<i+1>/<date>/<hour>__<i>.txt``. The date and the hour are
    read from the clock once, so all 15 files of a batch share the same
    stamp even when the writes cross an hour or a day boundary.

    Parameters
    ----------

    Returns
    -------
    None
        Errors are reported through ErrorWrapper, not raised.
    """
    try:
        now = datetime.now()
        day_stamp = str(now.date())
        hour_stamp = str(now.hour)
        for i in range(0, 15, 1):
            store_path = (self.location + "/" + str(i+1) + "/" + day_stamp + "/"
                          + hour_stamp + "__" + str(i) + ".txt")
            FileUtil.write_to_file_chunks(store_path, self.statuses[i], 500)
            # Start the next batch with a fresh list for this column.
            self.statuses[i] = []
    except Exception as e:
        ErrorWrapper(e).handle()
def date_range(self):
    """
    Generate every date from start_date up to and including end_date.

    Dates are one day apart. Nothing is produced when end_date lies before
    start_date.

    Parameters
    ----------

    Yields
    ------
    datetime
        The next date of the range.
    """
    try:
        total_days = int((self.end_date - self.start_date).days)
        offset = 0
        while offset <= total_days:
            yield self.start_date + timedelta(offset)
            offset += 1
    except Exception as e:
        print(ErrorWrapper(e).handle())
def file_reader(file_path):
    """
    Read the file in file_path and return a list of its lines.

    Parameters
    ----------
    file_path : str
        The file path.

    Returns
    -------
    list of str
        The file's lines without their line terminators, or None when the
        file cannot be read (the error is printed through ErrorWrapper).
    """
    try:
        # The context manager releases the file handle as soon as the
        # content has been read, including when reading fails.
        with open(file_path, encoding="utf-8", buffering=(2 << 16) + 8) as f:
            return f.read().splitlines()
    except Exception as e:
        print(ErrorWrapper(e).handle())
def get_stream_api_instance(self):
    """
    Create a streaming API instance bound to this wrapper's listener.

    The authentication handler is taken from a freshly built API instance.

    Parameters
    ----------

    Returns
    -------
    Stream
        The twitter streaming API instance, or None when an error occurs
        (the error is printed through ErrorWrapper).
    """
    try:
        auth_handler = self.build_api().auth
        return Stream(auth=auth_handler, listener=self.listener)
    except Exception as e:
        print(ErrorWrapper(e).handle())
def get_user(self, status_id):
    """
    Look up a status by its ID and return its author.

    Parameters
    ----------
    status_id : int
        The status ID.

    Returns
    -------
    object
        The ``author`` attribute of the fetched status. Nothing is returned
        when an error occurs (it is reported through ErrorWrapper).
    """
    try:
        return self.get_status_obj(status_id).author
    except Exception as e:
        ErrorWrapper(e).handle()
def append_files(read_file_path, write_file_path):
    """
    Append the lines of one file to the end of another.

    Parameters
    ----------
    read_file_path : str
        The path of the file whose lines are copied.
    write_file_path : str
        The path of the file the lines are appended to.

    Returns
    -------
    None
        Errors are reported through ErrorWrapper and printed, not raised.
    """
    try:
        FileUtil.write_to_file(write_file_path, FileUtil.file_reader(read_file_path))
    except Exception as e:
        print(ErrorWrapper(e).handle())
def get_status_obj(self, status_id):
    """
    Fetch a status by its ID through a newly built API instance.

    Parameters
    ----------
    status_id : int
        The status ID.

    Returns
    -------
    Status
        The resulting Status object. Nothing is returned when an error
        occurs (it is reported through ErrorWrapper).
    """
    try:
        return self.build_api().get_status(status_id)
    except Exception as e:
        ErrorWrapper(e).handle()
def get_friends(self, status_id):
    """
    Count the friends ids of the author of a status.

    Parameters
    ----------
    status_id : int
        The status ID.

    Returns
    -------
    int
        The length of the ids list returned for the status's author. Nothing
        is returned when an error occurs (it is reported through
        ErrorWrapper).
    """
    try:
        author = self.get_user(status_id)
        # NOTE(review): assumes the author object exposes friends_ids -
        # confirm against the tweepy version in use.
        friend_ids = author.friends_ids(author.id)
        return len(friend_ids)
    except Exception as e:
        ErrorWrapper(e).handle()
def __init__(self, start_time, location, days_range, config, batcher=None):
    """
    Initializer for StatusParser class.

    With a batcher, every attribute is taken from it (the statuses lists are
    shared, not copied) and the other arguments are ignored. Without one, a
    fresh state is built from the arguments.

    Parameters
    ----------
    start_time : datetime
        The batch start time.
    location : str
        The batch requested location in file system.
    days_range : int
        The number of previous days to look for when collecting reactions
        (favourites and retweets).
    config : str
        The application keys file path (for reactions collection).
    batcher : Batcher or None
        The copy constructor object.
    """
    try:
        if batcher is not None:
            copied = (
                "daily_start_time",
                "hourly_start_time",
                "hourly_elapsed_time",
                "daily_elapsed_time",
                "location",
                "days_range",
                "config",
                "statuses",
            )
            for attribute in copied:
                setattr(self, attribute, getattr(batcher, attribute))
            return

        # Both windows start at the same moment with nothing elapsed yet.
        self.daily_start_time = start_time
        self.hourly_start_time = start_time
        self.hourly_elapsed_time = 0
        self.daily_elapsed_time = 0
        self.location = location
        self.days_range = days_range
        self.config = config
        # One independent list per collected attribute.
        self.statuses = [[] for _ in range(15)]
    except Exception as e:
        ErrorWrapper(e).handle()
def write_item(file_path, line):
    """
    Append a string to the file in file_path.

    The text is written as given; no line terminator is added.

    Parameters
    ----------
    file_path : str
        The write file path.
    line : str
        The item we want to write to the file.

    Returns
    -------
    None
        Errors are reported through ErrorWrapper and printed, not raised.
    """
    try:
        # The context manager closes the handle even when the write fails.
        with io.open(file_path, 'a', encoding="utf-8") as f:
            f.write(line)
    except Exception as e:
        print(ErrorWrapper(e).handle())
def filter(self, keywords, stream, languages=None):
    """
    Open a connection with Twitter and filter the stream on the keywords.

    The stream is started in asynchronous mode. ``async`` is a reserved word
    since Python 3.7, so the flag is passed as ``is_async``; if the stream
    rejects that keyword with a TypeError (presumably an older tweepy
    release), the call is retried with the legacy ``async`` name.

    Parameters
    ----------
    keywords : list
        The keywords to filter on list.
    stream : Stream
        The stream instance.
    languages : list
        The requested tweets language

    Returns
    -------
    None
        Errors are reported through ErrorWrapper and printed, not raised.
    """
    try:
        try:
            stream.filter(track=keywords, is_async=True, languages=languages)
        except TypeError:
            # The legacy name can only be passed through dict unpacking
            # because it cannot be written as a keyword any more.
            stream.filter(track=keywords, languages=languages, **{"async": True})
    except Exception as e:
        print(ErrorWrapper(e).handle())
def __init__(self, config_file_path, listener_class=None):
    """
    Initializer for TwitterApiWrapper class.

    The listener is accepted when it is an instance of StreamListener or of
    any class derived from it, directly or indirectly. Anything else is
    reported as a ParametersError and the listener attribute is left unset.

    Parameters
    ----------
    config_file_path : str
        The application's keys file path.
    listener_class : object or None
        The listener implementation instance.
    """
    try:
        self.config = config_file_path
        if listener_class is None:
            return
        # isinstance follows the whole inheritance chain, unlike a check on
        # the first direct base class.
        if not isinstance(listener_class, StreamListener):
            raise ParametersError(
                Exception("wrong parameters class type"), 50)
        self.listener = listener_class
    except ParametersError as e:
        print(ErrorWrapper(e.ex, e.code).handle())
def extract_date_info(read_file_path, hours_write_file_path, week_days_write_file_path):
    """
    Split a file of dates into an hours file and a week days file.

    Every input line is split on single spaces: the first field is taken as
    the week day and the part of the fourth field before the first ':' as
    the hour. Both output files are written only after all lines have been
    processed.

    Parameters
    ----------
    read_file_path : str
        The date data file path.
    hours_write_file_path : str
        The desired hours only data file path.
    week_days_write_file_path : str
        The desired week days only data file path.

    Returns
    -------
    None
        Errors are reported through ErrorWrapper and printed, not raised.
    """
    try:
        hours, week_days = [], []
        for entry in FileUtil.file_reader(read_file_path):
            fields = entry.split(" ")
            week_day, clock = fields[0], fields[3]
            hours.append(clock.split(":")[0])
            week_days.append(week_day)
        FileUtil.write_to_file(hours_write_file_path, hours)
        FileUtil.write_to_file(week_days_write_file_path, week_days)
    except Exception as e:
        print(ErrorWrapper(e).handle())
def get_retweets(self, status_id):
    """
    Return the retweet count of the status with the given ID.

    Parameters
    ----------
    status_id : int
        The status ID.

    Returns
    -------
    int
        The status retweeting count, or -1 when no status object was
        obtained. Nothing is returned when an error occurs (it is reported
        through ErrorWrapper).
    """
    try:
        status = self.get_status_obj(status_id)
        return -1 if status is None else status.retweet_count
    except Exception as e:
        ErrorWrapper(e).handle()