def tweet_count_json(self):
    """Count tweets per day, broken down by tweet type.

    Reads every ``*.json`` file (one tweet object per line) found in
    ``self.data_path``, keeps only Persian ("fa") tweets whose text
    mentions one of the tracked hashtags, and tallies how many
    retweets/quotes/replies/originals occurred on each calendar day.

    :return: DataFrame with columns ``retweet``, ``quote``, ``reply``,
        ``original`` and ``date``, sorted ascending by ``date``.
    """
    count_dict = {}
    files = [
        f for f in listdir(self.data_path)
        if isfile(join(self.data_path, f))
    ]
    filters = {
        "tweet_type": ['retweet', 'quote', 'reply', 'original'],
        "lang": ["fa"]
    }
    # Normalize the tracked hashtags (strip '#', newlines and surrounding
    # whitespace) so a plain substring match against the tweet text works.
    hashtags = du.get_hashtags()
    tags = [
        k.replace('\n', '').replace('\r', '').replace('#', '').strip()
        for k in hashtags
    ]
    for file in files:
        # Skip non-JSON files and Office lock files ("~$...").
        if not (file.endswith(".json") and not file.startswith("~$")):
            continue
        df = pd.read_json(self.data_path + file, lines=True)
        # NOTE: previously a debug `break` stopped after the first file;
        # every JSON file in the directory is now processed.
        for _, row in df.iterrows():
            if (row["tweet_type"] in filters["tweet_type"]
                    and row["lang"] in filters["lang"]
                    and any(tag in row["text"] for tag in tags)):
                date = pd.to_datetime(
                    du.standardize_tweet_time(row["created_at"]),
                    format='%Y-%m-%d').date()
                # First tweet seen on this day: start all counters at zero.
                if date not in count_dict:
                    count_dict[date] = {
                        "retweet": 0,
                        "quote": 0,
                        "reply": 0,
                        "original": 0
                    }
                count_dict[date][row["tweet_type"]] += 1
    # Reformat: one row per day, with the day as a regular column.
    count_dict = pd.DataFrame(count_dict).T
    count_dict["date"] = count_dict.index
    count_dict = count_dict.reset_index(drop=True)
    # Sort by the "date" column explicitly (was a fragile positional
    # reference to columns[4]).
    count_dict = count_dict.sort_values("date", ascending=True)
    return count_dict
def clean_data_json(self, save_checkpoint=100):
    """Clean raw tweet JSON files and export the filtered tweets to Excel.

    Streams every ``*.json`` file (one tweet object per line) in
    ``self.data_path``, keeps Persian ("fa") tweets that are not replies,
    do not contain an 'RT' marker, and mention a tracked hashtag, then
    writes the cleaned rows to numbered ``.xlsx`` files under
    ``self.cleaned_tweet_path``.

    :param save_checkpoint: number of rows per output Excel file
    :return: None
    """
    df_columns = [
        "id", "created_at", "text", "user_description",
        "user_followers_count", "user_friends_count", "user_location",
        "user_location_carmen", "user_statuses_count", "user_verified"
    ]
    files = [
        f for f in listdir(self.data_path)
        if isfile(join(self.data_path, f))
    ]
    filters = {"lang": ["fa"]}
    # Normalize the tracked hashtags (strip '#', newlines and surrounding
    # whitespace) so a plain substring match against the tweet text works.
    hashtags = du.get_hashtags()
    tags = [
        k.replace('\n', '').replace('\r', '').replace('#', '').strip()
        for k in hashtags
    ]
    # Accumulate plain dicts and build a DataFrame only when flushing:
    # per-row DataFrame.append is O(n^2) and was removed in pandas 2.0.
    rows = []
    file_name_idx = 0
    for file in files:
        # Skip non-JSON files and Office lock files ("~$...").
        if not (file.endswith(".json") and not file.startswith("~$")):
            continue
        with open(self.data_path + file, encoding="utf-8") as f:
            for line in f:
                row = json.loads(line)
                # Keep originals only: has a user object, is not a reply,
                # passes the language filter, is not a manual retweet, and
                # mentions one of the tracked hashtags.
                if not ("user" in row
                        and row["in_reply_to_status_id_str"] in ["", None]
                        and row["lang"] in filters["lang"]
                        and 'RT' not in row["text"]
                        and any(tag in row["text"] for tag in tags)):
                    continue
                # Location is created by CARMEN, if resolved.
                if ("location" in row and "country" in row["location"]
                        and row["location"]["country"]
                        not in [None, "NaN", ""]):
                    user_location = row["location"]["country"]
                else:
                    user_location = ""
                rows.append({
                    "id": row["id"],
                    "created_at": row["created_at"],
                    "text": clean_persian_tweets(row["text"]),
                    "user_description": row["user"]["description"],
                    "user_followers_count":
                        row["user"]["followers_count"],
                    "user_friends_count": row["user"]["friends_count"],
                    "user_location": row["user"]["location"],
                    "user_location_carmen": user_location,
                    "user_statuses_count":
                        row["user"]["statuses_count"],
                    "user_verified": row["user"]["verified"]
                })
                # Saving file at checkpoint, then reset the buffer.
                if len(rows) % save_checkpoint == 0:
                    pd.DataFrame(rows, columns=df_columns).to_excel(
                        self.cleaned_tweet_path + str(file_name_idx)
                        + ".xlsx")
                    file_name_idx += 1
                    rows = []
    # Saving the last (partial) file.
    if rows:
        pd.DataFrame(rows, columns=df_columns).to_excel(
            self.cleaned_tweet_path + str(file_name_idx) + ".xlsx")
def clean_data_excel(self, count=100, save_checkpoint=100):
    """Clean raw tweet Excel files and export the filtered tweets to Excel.

    Reads up to ``count`` rows from every ``*.xlsx`` file in
    ``self.data_path``, keeps Persian ("fa") original tweets that mention
    a tracked Persian hashtag, and writes the cleaned rows to numbered
    ``.xlsx`` files under ``self.cleaned_tweet_path``.

    :param count: number of records to read from each input file
    :param save_checkpoint: number of rows per output Excel file
    :return: None
    """
    df_columns = [
        "id", "tweet_url", "created_at", "text", "user_description",
        "user_followers_count", "user_friends_count", "user_location",
        "user_statuses_count", "user_verified"
    ]
    files = [
        f for f in listdir(self.data_path)
        if isfile(join(self.data_path, f))
    ]
    filters = {"tweet_type": ["original"], "lang": ["fa"]}
    # Keep only the hashtags tagged as Persian ("fa").
    tags = du.get_hashtags()
    tags = [k for k, v in tags.items() if "fa" in v]
    # Accumulate plain dicts and build a DataFrame only when flushing:
    # per-row DataFrame.append is O(n^2) and was removed in pandas 2.0.
    rows = []
    file_name_idx = 0
    for file in files:
        # Skip non-Excel files and Office lock files ("~$...").
        if not (file.endswith(".xlsx") and not file.startswith("~$")):
            continue
        df = pd.read_excel(self.data_path + file, nrows=count)
        for _, row in df.iterrows():
            if not (row["tweet_type"] in filters["tweet_type"]
                    and row["lang"] in filters["lang"]
                    and any(tag in row["text"] for tag in tags)):
                continue
            rows.append({
                "id": row["id"],
                "tweet_url": row["tweet_url"],
                "created_at": row["created_at"],
                "text": clean_persian_tweets(row["text"]),
                "user_description": row["user_description"],
                "user_followers_count": row["user_followers_count"],
                "user_friends_count": row["user_friends_count"],
                "user_location": row["user_location"],
                "user_statuses_count": row["user_statuses_count"],
                "user_verified": row["user_verified"]
            })
            # Saving file at checkpoint, then reset the buffer.
            if len(rows) % save_checkpoint == 0:
                pd.DataFrame(rows, columns=df_columns).to_excel(
                    self.cleaned_tweet_path + str(file_name_idx)
                    + ".xlsx")
                file_name_idx += 1
                rows = []
    # Saving the last (partial) file.
    if rows:
        pd.DataFrame(rows, columns=df_columns).to_excel(
            self.cleaned_tweet_path + str(file_name_idx) + ".xlsx")