def preprocess(data):
    lang = data['lang']
    text_xx = 'text_' + lang
    data['tweet_date'] = datetime.datetime.strptime(
        data['created_at'], '%a %b %d %H:%M:%S %z %Y'
    ).strftime('%Y-%m-%dT%H:00:00.00Z')
    if 'extended_tweet' in data:
        data['tweet_text'] = data['extended_tweet']['full_text']
    else:
        data['tweet_text'] = data['text']
    if 'geo' in data and data['geo'] and 'coordinates' in data['geo'] and 'coordinates' in data['geo']['coordinates']:
        data['tweet_loc'] = ','.join(str(x) for x in data['geo']['coordinates']['coordinates'])
    elif 'place' in data and data['place'] and 'bounding_box' in data['place']:
        data['tweet_loc'] = ','.join(str(x) for x in data['place']['bounding_box']['coordinates'][0][0])
    parsed_text = p.parse(data['tweet_text'])
    data['tweet_emoticons'] = [t.match for e in [parsed_text.emojis, parsed_text.smileys] if e is not None for t in e]
    data[text_xx] = p.clean(data['tweet_text']).lower()
    # words = re.findall(r'\w+', data[text_xx], flags=re.UNICODE)
    # if lang in lang_map:
    #     data[text_xx] = remove_stop_words(stopwords.words(lang_map[lang]), words)
    # elif lang == 'hi':
    #     data[text_xx] = remove_stop_words(hindi_words, words)
    # else:
    #     data[text_xx] = remove_stop_words(thai_words, words)
    return data
def preprocess_data(X):
    """
    :param X: iterable of raw tweet strings.
    :return: numpy array with each tweet cleaned, tokenized and placeholder-substituted.
    """
    X_new = np.array(X)
    for i, x in enumerate(X_new):
        # For each sample tweet x in the domain X_new, clean all URLs and numbers
        p.set_options(*CLEAN_OPTION_SET)
        s = p.clean(x)
        # Replace all-caps words with the placeholder ALL_CAPS_WORD
        split_tweet = s.split()
        for w in split_tweet:
            if w.isupper():
                s = s.replace(w, "ALL_CAPS_WORD")
        p.set_options(*PARSE_OPTION_SET)
        # Figure out if the tweet is a retweet - if so, replace it with the person being retweeted
        tweet = s
        s = p.parse(tweet)
        p.set_options(*TOKENIZING_OPTION_SET)
        if s.reserved is not None and "RT" in s.reserved[0].match:
            tweet_modified = s.mentions[0].match
        else:
            tweet_modified = p.tokenize(tweet)
        # Replace all special characters with placeholders to make them unique
        for ch in SPECIAL_CHAR_SET:
            tweet_modified = tweet_modified.replace(ch, SPECIAL_CHAR_SET[ch])
        X_new[i] = tweet_modified
    return X_new
def clean_tweet_text(tweet_text):
    tweet_text = tweet_text.replace("’", "'").replace("…", "...")
    tweet_parser = p.parse(tweet_text)
    cleaned_tweet = tweet_text
    hash_tags = tweet_parser.hashtags
    if hash_tags is not None:
        for hash_tag in hash_tags:
            cleaned_tweet = cleaned_tweet.replace(hash_tag.match, " ".join(wordninja.split(hash_tag.match[1:])))
    tweet_urls = tweet_parser.urls
    if tweet_urls is not None:
        for url_link in tweet_urls:
            cleaned_tweet = cleaned_tweet.replace(url_link.match, " url$$ ")
    tweet_emojis = tweet_parser.emojis
    if tweet_emojis is not None:
        for emoji in tweet_emojis:
            cleaned_tweet = cleaned_tweet.replace(emoji.match, " emoji$$ ")
    cleaned_tweet = cleaned_tweet.split("via")[0].split("|")[0].split(" - ")[0].split(" – ")[0]
    cleaned_tweet_tokens = []
    for word_token in cleaned_tweet.split(" "):
        word_token = word_token.strip()
        if word_token.endswith("$$") or word_token in COMMON_ENGLISH_WORDS:
            cleaned_tweet_tokens.append(word_token)
        elif len(word_token) > 0:
            split_tokens = [w for w in wordninja.split(word_token) if w not in string.punctuation]
            cleaned_tweet_tokens += [token for token in split_tokens if not is_number(token)]
    cleaned_tweet = " ".join(cleaned_tweet_tokens)
    return cleaned_tweet
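# Editorial usage sketch (not from the original source): a minimal, hedged example of
# calling clean_tweet_text on a made-up tweet. It assumes the module-level dependencies
# the function references above (wordninja, COMMON_ENGLISH_WORDS, is_number, and `p` as
# the tweet-preprocessor module) are defined; the variable name `example_tweet` is
# purely illustrative.
example_tweet = "Loving this #DataScience demo 😀 via https://t.co/abc"
print(clean_tweet_text(example_tweet))
# Expected shape (approximate): the hashtag is split into separate words by wordninja,
# the emoji becomes the "emoji$$" placeholder, and everything after "via" (including
# the URL) is dropped by the trailing split.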
def test_set_options(self):
    tweet = "Preprocessor now has custom #options support! https://github.com/s/preprocessor"
    p.set_options(p.OPT.URL)
    parsed_tweet = p.parse(tweet)
    self.assertIsNone(parsed_tweet.hashtags)
    self.assertIsNotNone(parsed_tweet.urls)
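# Editorial note (sketch, not part of the original test suite): set_options() is
# module-global state in tweet-preprocessor, so it changes what every later call to
# p.parse()/p.clean() looks for until options are set again. A small self-contained
# illustration mirroring the test above; the tweet text is made up.
import preprocessor as p

p.set_options(p.OPT.HASHTAG)
parsed = p.parse("Try #one and #two at https://github.com/s/preprocessor")
assert parsed.urls is None        # the URL option was not enabled
assert len(parsed.hashtags) == 2  # only hashtags were extracted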
def clean_tweet(tweet):
    tweet_clean = {key: tweet[key] for key in ['created_at', 'id', 'id_str', 'in_reply_to_user_id',
                                               'in_reply_to_user_id_str', 'in_reply_to_screen_name',
                                               'retweet_count', 'favorite_count', 'lang']}
    if 'full_text' in tweet.keys():
        tweet_clean['text'] = tweet['full_text']
    elif 'extended_tweet' in tweet.keys():
        tweet_clean['text'] = tweet['extended_tweet']['full_text']
    else:
        tweet_clean['text'] = tweet['text']
    if 'quote_count' in tweet.keys():
        tweet_clean['quote_count'] = tweet['quote_count']
    if 'reply_count' in tweet.keys():
        tweet_clean['reply_count'] = tweet['reply_count']
    tweet_clean['datetime'] = datetime.fromtimestamp(parser.parse(tweet['created_at']).timestamp())
    if 'type' not in tweet.keys():
        tweet_clean['type'] = tweet_type(tweet)
    if 'tweet_user_id' not in tweet.keys():
        tweet_clean['tweet_user_id'] = tweet_creator(tweet)['id']
    if 'tweet_user_id_str' not in tweet.keys():
        tweet_clean['tweet_user_id_str'] = tweet_creator(tweet)['id_str']
    if 'tweet_user_screen_name' not in tweet.keys():
        tweet_clean['tweet_user_screen_name'] = tweet_creator(tweet)['screen_name']
    tweet_clean['timestamp'] = parser.parse(tweet['created_at']).timestamp()
    tweet_clean['text_processed'] = preprocess_text(tweet_clean['text'])
    text = tweetp.parse(tweet_clean['text'])
    tweet_clean['emojis'] = min(length(text.emojis), 127)
    tweet_clean['hashtags'] = min(length(text.hashtags), 127)
    tweet_clean['urls'] = min(length(text.urls), 127)
    tweet_clean['mentions'] = min(length(text.mentions), 127)
    return tweet_clean
def cleaning_sentence(sentence):
    stop_free = " ".join([i for i in sentence.lower().split() if i not in stop])
    hashtag_value = p.parse(stop_free).hashtags
    sentence = p.clean(stop_free)
    sentence = re.sub("[^A-Za-z .]+", "", sentence)
    normalized = " ".join(lemma.lemmatize(word) for word in sentence.split())
    return normalized
def _parse(text):
    """
    Parses elements from tweets.

    :param text: raw tweet text.
    :return: the tweet-preprocessor parse result.
    """
    return preproc.parse(text)
def updating_columns(self):
    # Creating other columns for posts
    self.posts['hashtags'] = self.posts.text.apply(
        lambda t: [h.match for h in p.parse(t).hashtags] if p.parse(t).hashtags else None)
    self.posts['time'] = self.posts.time.apply(datetime.fromtimestamp)
    # Creating other columns for comments
    self.comments['time'] = self.comments.time.apply(datetime.fromtimestamp)
    self.comments['processed_comment'] = self.comments.text.str.lower()
    self.comments = stopwords_correction(self.comments, 'processed_comment')
def replace_hashtags(tweet):
    p.set_options(p.OPT.MENTION, p.OPT.HASHTAG, p.OPT.URL, p.OPT.RESERVED)
    t = p.parse(tweet)
    if t.hashtags:
        for i in t.hashtags:
            # Replace only the leading '#' character with a space, keeping the hashtag word itself
            tweet = tweet[:i.start_index] + ' ' + tweet[i.start_index + 1:]
    return tweet
def tweet_cleaning(text):
    parsed = p.parse(text)
    emojis = [x.match for x in parsed.emojis] if parsed.emojis is not None else []
    hashtags = [x.match for x in parsed.hashtags] if parsed.hashtags is not None else []
    p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.RESERVED, p.OPT.NUMBER)
    text = p.clean(text)
    p.set_options(p.OPT.EMOJI, p.OPT.SMILEY, p.OPT.HASHTAG)
    text2 = p.clean(text)
    return [text, text2, emojis, hashtags]
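# Editorial usage sketch (not from the original source): tweet_cleaning above first
# records emoji/hashtag matches from the initial parse (so those elements must be
# enabled in the module-global options at that point), then cleans in two passes.
# The sample text and variable names below are made up for illustration.
sample = "RT @user: big news!! #Breaking 😀 https://t.co/xyz"
stripped, stripped_no_tags, found_emojis, found_hashtags = tweet_cleaning(sample)
# `stripped` drops the URL, mention, RT marker and numbers but keeps "#Breaking" and the
# emoji; `stripped_no_tags` additionally removes emojis, smileys and hashtags; the two
# lists hold the raw "#Breaking" and "😀" matches (assuming those parse options were on).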
def insertTweetsIntoDB(self):
    concept = "coronavirus"
    analysisID = "passif"
    preProcessing = TweetsPreProcessing()
    dirPath = "C:/Users/Raouf/PycharmProjects/PFE_SII_M2/TweetFiles/"
    allFiles = [f for f in listdir(dirPath) if isfile(join(dirPath, f))]
    print(allFiles)
    for fileName in allFiles[63:]:
        print(fileName)
        if fileName.startswith("ExtractedTweetsFor"):
            # check if the file has already been loaded to the database or not
            if not fileName.endswith("Loaded.json"):
                fullFileName = dirPath + fileName
                tweetsFile = open(fullFileName, 'r', encoding="utf-8")
                tweets = json.load(tweetsFile)
                cpt = 0
                for tweet in tweets['tweets']:
                    tweetLanguage = tweet['lang']
                    if tweetLanguage == "en":
                        allTweets = AllTweets()
                        tweetID = tweet['id_str']
                        parsed_tweet = p.parse(tweet['text'])
                        hashtagsList = parsed_tweet.hashtags
                        print(hashtagsList)
                        hashtagsText = ""
                        if hashtagsList is not None:
                            for hashtag in hashtagsList:
                                print(hashtag.match)
                                hashtagsText += " " + str(hashtag.match)
                        text = p.clean(tweet['text'])
                        text = " ".join(re.findall(r'\w+', text))
                        text += ", hashtags : " + hashtagsText
                        # text = "text"
                        row = [tweetID, text]
                        row += preProcessing.getLangage(tweet['lang']) \
                            + preProcessing.getLocation(tweet['user']['location']) \
                            + preProcessing.getTime(tweet['created_at']) \
                            + preProcessing.getSentimentAnalysis(tweet['text']) \
                            + preProcessing.getSource(tweet['source'])
                        row += [analysisID, concept]
                        # print(row)
                        try:
                            allTweets.insert(row)
                            cpt += 1
                            print(cpt, "tweets ", sep=" ")
                        except:
                            print("encoding error")
                print("For the file : ", fileName, ", Tweets number is : ", cpt)
                tweetsFile.close()
def _tweet_preprocessing(tweet_texts):
    """Preprocessing

    1. remove URLs (and put them in separate place)
    2. remove Emojis

    Returns the cleaned Tweet and parsed URLs
    """
    cleaned_tweet_texts = twitter_prepro.clean(tweet_texts)
    parsed_url = twitter_prepro.parse(tweet_texts).urls
    if parsed_url is not None:
        parsed_url = [u.match for u in parsed_url]
    return cleaned_tweet_texts, parsed_url
def get_tweet_url(tweet_text: str):
    parsed_tweet = cleaning_processor.parse(tweet_text)
    tweet_url = None
    if not parsed_tweet.urls:
        return tweet_url
    last_index = len(tweet_text)
    for url_info in parsed_tweet.urls:
        if url_info.end_index == last_index:
            tweet_url = url_info.match
            break
    return tweet_url
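# Editorial usage sketch (not from the original source): get_tweet_url only returns a
# URL whose parsed end_index coincides with the end of the tweet text, i.e. a trailing
# link. It assumes `cleaning_processor` is the tweet-preprocessor module with URL
# parsing enabled; the example strings are made up and the outputs are not asserted.
print(get_tweet_url("read this https://example.com/a"))
print(get_tweet_url("https://example.com/a and then more text"))
# The first call is intended to return the trailing URL and the second None, given how
# end_index is compared against len(tweet_text) above.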
def parse_mentions(tweet):
    """
    Parses a tweet for mentions.

    Parameters
    ----------
    tweet : the text of a tweet.

    Returns
    -------
    A list of parsed mentions (or an empty list).
    """
    parsed_tweet = p.parse(tweet)
    if parsed_tweet.mentions is None:
        return []
    else:
        return [x.match for x in parsed_tweet.mentions]
def parse_hashtags(tweet):
    """
    Parses a tweet for hashtags.

    Parameters
    ----------
    tweet : the text of a tweet.

    Returns
    -------
    A list of parsed hashtags (or an empty list).
    """
    parsed_tweet = p.parse(tweet)
    if parsed_tweet.hashtags is None:
        return []
    else:
        return [x.match for x in parsed_tweet.hashtags]
def parse_reserved_words(tweet):
    """
    Parses a tweet for reserved words ("RT" or "FAV").

    Parameters
    ----------
    tweet : the text of a tweet.

    Returns
    -------
    A list of parsed reserved words (or an empty list).
    """
    parsed_tweet = p.parse(tweet)
    if parsed_tweet.reserved is None:
        return []
    else:
        return [x.match for x in parsed_tweet.reserved]
def parse_smileys(tweet):
    """
    Parses a tweet for smiley faces.

    Parameters
    ----------
    tweet : the text of a tweet.

    Returns
    -------
    A list of parsed smiley faces (or an empty list).
    """
    parsed_tweet = p.parse(tweet)
    if parsed_tweet.smileys is None:
        return []
    else:
        return [x.match for x in parsed_tweet.smileys]
def parse_numbers(tweet):
    """
    Parses a tweet for numbers.

    Parameters
    ----------
    tweet : the text of a tweet.

    Returns
    -------
    A list of parsed numbers (or an empty list).
    """
    parsed_tweet = p.parse(tweet)
    if parsed_tweet.numbers is None:
        return []
    else:
        return [x.match for x in parsed_tweet.numbers]
def parse_urls(tweet):
    """
    Parses a tweet for URLs.

    Parameters
    ----------
    tweet : the text of a tweet.

    Returns
    -------
    A list of parsed URLs (or an empty list).
    """
    parsed_tweet = p.parse(tweet)
    if parsed_tweet.urls is None:
        return []
    else:
        return [x.match for x in parsed_tweet.urls]
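# Editorial usage sketch (not from the original source): the parse_* helpers above share
# one pattern -- p.parse() returns None for an element type that is absent, so each helper
# normalizes that to an empty list. A made-up example, assuming the corresponding OPT
# flags are enabled via p.set_options(); the outputs in the comments are approximate.
demo = "RT @someone: counting 1 2 3 #demo :) https://github.com/s/preprocessor"
print(parse_mentions(demo))        # roughly ['@someone']
print(parse_hashtags(demo))        # roughly ['#demo']
print(parse_reserved_words(demo))  # roughly ['RT']
print(parse_urls(demo))            # roughly ['https://github.com/s/preprocessor']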
def extract_info_from_cluster_table(cluster_edge_table):
    text_list = []
    htag_list = []
    url_list = []
    for index, row in cluster_edge_table.iterrows():
        username = row['user']
        # if not 'tweets' in row:
        #     continue  # pass to the next user
        tweet_df = pd.read_json(row['tweets'])
        for idx, tweet in tweet_df.iterrows():
            htags = tweet['hashtags']
            urls = tweet['urls']
            text = tweet['text']
            retweet_count = tweet['retweet_count']
            favorite_count = tweet['favorite_count']
            parsed_tweet = tweetpre.parse(text)
            # extract emojis
            emojis = []
            if parsed_tweet.emojis is not None:
                emojis = [emo.match for emo in parsed_tweet.emojis]
            tweetpre.set_options(tweetpre.OPT.MENTION, tweetpre.OPT.URL)
            filtered_text = tweetpre.clean(text)
            tweetpre.set_options()
            # emojis = parsed_tweet.emojis.match ???
            url_c = [twitutils.convert_bitly_url(url_string) for url_string in urls]
            text_list.append({'text': text, 'user': username, 'url': url_c, 'emojis': emojis,
                              'retweet_count': retweet_count, 'favorite_count': favorite_count,
                              'filtered text': filtered_text, 'bcentrality': row['bcentrality']})
            htag_list += htags
            url_list += urls
    if not text_list:
        empty_df = pd.DataFrame()
        return {'text': empty_df, 'hashtags': empty_df, 'words': empty_df, 'urls': empty_df}
    text_df = pd.DataFrame(text_list)
    mostcommon_words_df = most_common_words(text_df['filtered text'])
    hashtags_df = count_order_items(htag_list, 'hashtag')
    url_df = count_order_items(url_list, 'url')
    url_df = twitutils.convert_bitly_table(url_df)
    filtered_url_df = twitutils.drop_twitter_urls(url_df)
    return {'text': text_df, 'hashtags': hashtags_df, 'words': mostcommon_words_df, 'urls': filtered_url_df}
def test_parse(self):
    tweet = 'A tweet with #hashtag :) @mention 😀 and http://github.com/s.'
    parsed_tweet = p.parse(tweet)
    self.assertIsNotNone(parsed_tweet.urls)
    self.assertEqual(1, len(parsed_tweet.urls))
    self.assertIsNotNone(parsed_tweet.hashtags)
    self.assertEqual(1, len(parsed_tweet.hashtags))
    self.assertIsNotNone(parsed_tweet.mentions)
    self.assertEqual(1, len(parsed_tweet.mentions))
    self.assertIsNone(parsed_tweet.reserved_words)
    self.assertIsNotNone(parsed_tweet.emojis)
    self.assertEqual(1, len(parsed_tweet.emojis))
    self.assertEqual("😀", parsed_tweet.emojis[0].match)
    self.assertIsNotNone(parsed_tweet.smileys)
    self.assertEqual(1, len(parsed_tweet.smileys))
    self.assertEqual(":)", parsed_tweet.smileys[0].match)
def test_parse(self):
    tweet = 'A tweet with #hashtag :) @mention 😀 and http://github.com/s.'
    p.set_options(p.OPT.URL, p.OPT.HASHTAG, p.OPT.MENTION, p.OPT.EMOJI, p.OPT.SMILEY)
    parsed_tweet = p.parse(tweet)
    self.assertIsNotNone(parsed_tweet.urls)
    self.assertEqual(1, len(parsed_tweet.urls))
    self.assertIsNotNone(parsed_tweet.hashtags)
    self.assertEqual(1, len(parsed_tweet.hashtags))
    self.assertIsNotNone(parsed_tweet.mentions)
    self.assertEqual(1, len(parsed_tweet.mentions))
    self.assertIsNone(parsed_tweet.reserved_words)
    self.assertIsNotNone(parsed_tweet.emojis)
    self.assertEqual(1, len(parsed_tweet.emojis))
    self.assertEqual("😀", parsed_tweet.emojis[0].match)
    self.assertIsNotNone(parsed_tweet.smileys)
    self.assertEqual(1, len(parsed_tweet.smileys))
    self.assertEqual(":)", parsed_tweet.smileys[0].match)
tweet = []
hashlist = ""
menlist = ""
urllist = ""
emolist = ""
emolist1 = ""
# jsonFile = open("dilmabr.json", 'a')
with open("ArvindKejriwal.json", 'r') as a:
    # y = json.loads(a)
    for line in a:
        content = json.loads(line)
        x = p.parse(content["text"])
        if content["in_reply_to_screen_name"] != "null":
            content["poi_name"] = content["user"]["screen_name"]
            content["poi_id"] = content["user"]["id"]
            poiname = content["poi_name"]
        else:
            content["poi_name"] = content["in_reply_to_screen_name"]
            content["poi_id"] = content["in_reply_to_user_id"]
            poiname = content["poi_name"]
def get_usermentions(singletweet):
    return p.parse(singletweet).mentions
# -*- coding: utf-8 -*-
import string

import preprocessor as p
from transliteration import getInstance

text = "RT @Pritpal77777: @Gurmeetramrahim GURU JI!! 61210 peoples take pledge to leave their drug and bad deeds n to adopt path of righteousness i\xe2\x80\xa6"
parsed_tweet = p.parse(text)
print(parsed_tweet.hashtags, parsed_tweet.mentions, parsed_tweet.reserved_words)

text = "ke liye best kar rahe hain to apki talash yahan par khatm hoti hai kyonki is post me main apko top video downloader apps ke baare me bataunga jinhe bahut hi aasani se install or use kiya jaa sakta ha."
t = getInstance()
t_text = t.transliterate(text, "hi_IN")
print(t_text)

# urls = None
# emojis = None
# smileys = None
# hashtags = None
# mentions = None
# reserved_words = None
def prepare_data(folder, input_name2hourList, phase_category2keywords, output_flag):
    print("prepare_data()...")
    tweet_id2content = {}
    tweet_id_set = set()
    chunk2importance = {}
    chunk2IDF = {}
    # hashtag2tweet_num = {}
    mention2tweet_num = {}
    actor2tweet_num = {}
    actor2tweet_ids = {}
    actor2replied_actor = {}
    actor2geo = {}
    nlp = StanfordCoreNLP("../../tools/stanford-corenlp-full-2017-06-09")
    word2lemma = {}
    for input_name in input_name2hourList:
        input_file = open(folder + input_name, "r")
        hourList = input_name2hourList[input_name]
        for line in input_file:
            tweet = ast.literal_eval(line)
            if "body" in tweet and "actor_Username" in tweet:
                actor = "@" + tweet["actor_Username"]
                words = tweet["body"].split()
                if words[0][0] == "@":
                    if actor not in actor2replied_actor:
                        actor2replied_actor[actor] = [[words[0], tweet["postedTime"]]]
                    else:
                        actor2replied_actor[actor] += [[words[0], tweet["postedTime"]]]
                elif words[0] == "RT":
                    if actor not in actor2replied_actor:
                        actor2replied_actor[actor] = [[words[1].replace(":", ""), tweet["postedTime"]]]
                    else:
                        actor2replied_actor[actor] += [[words[1].replace(":", ""), tweet["postedTime"]]]
            if tweet_filter(tweet, hourList) == True:
                continue
            words = tweet["link"].split("/")
            tweet_id = words[3] + "_" + words[5]
            chunk_set, hashtag_set, mention_set = set(), set(), set()
            tweet_id_set.add(tweet_id)
            actor = "@" + words[3]
            if actor not in actor2tweet_num:
                actor2tweet_num[actor] = 1
            else:
                actor2tweet_num[actor] += 1
            if actor not in actor2tweet_ids:
                actor2tweet_ids[actor] = [tweet_id]
            else:
                actor2tweet_ids[actor] += [tweet_id]
            if "geo" in tweet:
                if actor not in actor2geo:
                    actor2geo[actor] = [tweet["geo"]]
                else:
                    actor2geo[actor] += [tweet["geo"]]
            parsed_tweet = p.parse(tweet["body"])
            mentions = parsed_tweet.mentions
            hashtags = parsed_tweet.hashtags
            sentence_chunks = set()
            for chunk in tweet["chunkList"]:
                if chunk[0] is None:
                    continue
                if chunk[1] in word2lemma:
                    new_chunk = word2lemma[chunk[1]]
                else:
                    new_chunk = lemmatize(nlp, chunk[1])
                    word2lemma[chunk[1]] = new_chunk
                if new_chunk not in chunk2importance:
                    chunk2importance[new_chunk] = list([chunk[2]])
                else:
                    chunk2importance[new_chunk] += [chunk[2]]
                sentence_chunks.add(new_chunk)
                chunk_set.add(new_chunk)
            for new_chunk in sentence_chunks:
                if new_chunk not in chunk2IDF:
                    chunk2IDF[new_chunk] = 1.0
                else:
                    chunk2IDF[new_chunk] += 1.0
            if hashtags is not None:
                for hashtag in hashtags:
                    tag = hashtag.match.lower()
                    hashtag_set.add(tag)
                    # if tag not in hashtag2tweet_num:
                    #     hashtag2tweet_num[tag] = 1
                    # else:
                    #     hashtag2tweet_num[tag] += 1
            if mentions is not None:
                for mention in mentions:
                    m = mention.match
                    mention_set.add(m)
                    if m not in mention2tweet_num:
                        mention2tweet_num[m] = 1
                    else:
                        mention2tweet_num[m] += 1
            if "geo" in tweet:
                tweet_id2content[tweet_id] = {
                    "body": tweet["body"],
                    "actor": actor,
                    "chunks": chunk_set,
                    "hashtags": hashtag_set,
                    "mentions": mention_set,
                    "geo": tweet["geo"],
                    "postedTime": tweet["postedTime"],
                }
            else:
                tweet_id2content[tweet_id] = {
                    "body": tweet["body"],
                    "actor": actor,
                    "chunks": chunk_set,
                    "hashtags": hashtag_set,
                    "mentions": mention_set,
                    "postedTime": tweet["postedTime"],
                }
        input_file.close()
    nlp.close()
    total_doc = len(tweet_id_set)
    for chunk in chunk2IDF:
        chunk2IDF[chunk] = math.log(total_doc / chunk2IDF[chunk])
    pickle.dump(tweet_id_set, open(output_flag + "tweet_id_set.p", "wb"))
    pickle.dump(chunk2importance, open(output_flag + "chunk2importance.p", "wb"))
    pickle.dump(chunk2IDF, open(output_flag + "chunk2IDF.p", "wb"))
    pickle.dump(tweet_id2content, open(output_flag + "tweet_id2content.p", "wb"))
    # pickle.dump(hashtag2tweet_num, open(output_flag + "hashtag2tweet_num.p", "wb"))
    pickle.dump(mention2tweet_num, open(output_flag + "mention2tweet_num.p", "wb"))
    pickle.dump(actor2tweet_num, open(output_flag + "actor2tweet_num.p", "wb"))
    pickle.dump(actor2tweet_ids, open(output_flag + "actor2tweet_ids.p", "wb"))
    pickle.dump(actor2replied_actor, open(output_flag + "actor2replied_actor.p", "wb"))
    pickle.dump(actor2geo, open(output_flag + "actor2geo.p", "wb"))
    for phase_category in phase_category2keywords:
        keywords = set(phase_category2keywords[phase_category])
        category_tweet_id_set = set()
        for tweet_id in tweet_id2content:
            # if len(tweet_id2content[tweet_id]["chunks"] & keywords) != 0:
            cleaned_tweet_words = p.clean(tweet_id2content[tweet_id]["body"])
            cleaned_tweet_words = process_text(cleaned_tweet_words)
            cleaned_tweet_words = set(cleaned_tweet_words.split())
            if len(cleaned_tweet_words & keywords) != 0:
                category_tweet_id_set.add(tweet_id)
        pickle.dump(category_tweet_id_set, open(phase_category + "/tweet_id_set.p", "wb"))
        splits.append(split_on_numbers[0])
    else:
        splits.append(upper_split)


tokenizer = TweetTokenizer()
lemmatizer = WordNetLemmatizer()
stemmer = SnowballStemmer('english')
word_vectors = []
length_vector = []
index = 0
start = timeit.default_timer()
text_file = open("cleanedTweets.txt", "w")
for tweets in open('tweets.txt', encoding="utf-8"):
    p.set_options(p.OPT.HASHTAG)
    parsed_tweet = p.parse(tweets)
    hashtags = parsed_tweet.hashtags
    splits = []
    if hashtags:
        for hashtag in hashtags:
            split_hashtags(hashtag)
    p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.RESERVED, p.OPT.NUMBER, p.OPT.HASHTAG)
    stop_words = set(stopwords.words("english"))
    extra_stop_words = [
        '.', '|', '+', '~', '✓', '︎', '“', "'", '—', '⠀', '-', ',', '•', '・',
        '_', '!', '&', ')', '(', '…', '️', ' ', '...', '"', '/', '?', '', '..', ':'
    ]
    for symbol in extra_stop_words:
        stop_words.add(symbol)
def extract_elements(tweets):
    """
    Function that uses the tweet-preprocessor and emoji libraries to identify and save:
        - #Hashtags
        - @Mentions
        - Emojis

    Args:
        tweets: list containing all tweets

    Returns:
        hashtags: list of hashtags
        mentions: list of mentions
        emojis: list of emojis

    Sources:
        https://pypi.org/project/tweet-preprocessor/
        https://github.com/carpedm20/emoji/
    """
    # set the global options for the library. These settings define which
    # elements of the tweet to pay attention to
    tp.set_options(tp.OPT.URL, tp.OPT.EMOJI, tp.OPT.MENTION, tp.OPT.NUMBER, tp.OPT.HASHTAG)

    # create empty lists to store the results
    hashtags = []
    mentions = []
    emojis = []

    # iterate over all tweets in the list
    for tweet in tweets:
        # parse tweet to extract the relevant elements defined in the options
        parsed_tweet = tp.parse(tweet)

        # 1. save the hashtags
        h_tweet = []
        if parsed_tweet.hashtags is not None:
            for hashtag in parsed_tweet.hashtags:
                h_tweet.append(hashtag.match)
        # save to the global list as a space separated string
        hashtags.append(' '.join(h_tweet))

        # 2. save the emojis (using the emoji library)
        e_tweet = []
        if len(emoji.emoji_lis(tweet)) > 0:
            for e in emoji.emoji_lis(tweet):
                e_tweet.append(e['emoji'])
        # save to the global list as a space separated string
        emojis.append(' '.join(e_tweet))

        # 3. save the mentions
        m_tweet = []
        if parsed_tweet.mentions is not None:
            for mention in parsed_tweet.mentions:
                m_tweet.append(mention.match)
        # save to the global list as a space separated string
        mentions.append(' '.join(m_tweet))

    return (hashtags, mentions, emojis)
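# Editorial usage sketch (not from the original source): extract_elements returns three
# parallel lists, one entry per input tweet, each entry a space-separated string of
# matches (empty string when nothing was found). The two demo tweets and the variable
# names below are made up for illustration.
demo_tweets = [
    "Big launch today! #space 🚀 cc @agency",
    "no tags here",
]
tags, ats, emo = extract_elements(demo_tweets)
# roughly: tags -> ['#space', ''], ats -> ['@agency', ''], emo -> ['🚀', '']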
def cleaning_sentence(sentence):
    hashtag_value = p.parse(sentence).hashtags
    sentence = p.clean(sentence)
    sentence = re.sub("[^A-Za-z .]+", "", sentence)
    return sentence
def csv_to_json(filename, output_file):
    d = {}
    with open(os.path.join(DATA_DIR, filename), encoding="utf-8") as r:
        reader = csv.reader(r, delimiter=",", quotechar='"')
        next(reader, None)
        for row in reader:
            link, date, time, author_id, followers, following, *content_rows = row
            content = ""
            for content_row in content_rows:
                content += f" {content_row}"
            url, id_str = link.split("statuses/")
            hashtags = []
            mentions = []
            url_strings = []
            parsed_tweet = p.parse(content)
            try:
                for hashtag in parsed_tweet.hashtags:
                    hashtags.append(hashtag.match)
            except TypeError:
                pass
            try:
                for mention in parsed_tweet.mentions:
                    mentions.append(mention.match)
            except TypeError:
                print(f"{author_id}: {content}")
            try:
                for url_string in parsed_tweet.urls:
                    url_strings.append(url_string.match)
            except TypeError:
                pass
            d[id_str] = {
                'id_str': id_str,
                'date': date,
                'text': content,
                # To do: tokenize tweet to get retweets
                'retweet_count': 0,
                'favorite_count': 0,
                'reply_to': 0,
                'coordinates': 0,
                'reply_to_tweet': 0,
                'user_screen_name': f"@{author_id}",
                'quoted_status': 0,
                'lang': 0,
                'entities': 0,
                'urls': url_strings,
                'hashtags': hashtags,
                'user_mentions': mentions,
                'user': author_id,
            }
    with open(os.path.join(RESULTS_DIR, "converted_" + output_file + ".json"), 'w') as f:
        f.write(json.dumps(d, indent=1))
    print("Success")
    else:
        reply_text = tweet_text
    tweet_lang = tweet['lang']
    hashtags = ''
    mentions = ''
    tweet_urls = ''
    tweet_emoticons = ''
    tweet_date = tweet['created_at']
    tweet_loc = tweet['coordinates']
    # if 'retweeted_status' in tweet:
    #     retweeted_status = tweet['retweeted_status']
    #     isRetweet = True
    # else:
    #     isRetweet = False
    parsedTweet = tpp.parse(tweet_text)
    splitTime = tweet_date.split()
    month = months[splitTime[1]]
    year = splitTime[5]
    day = splitTime[2]
    time = splitTime[3]  # .split(':')[0]+':00:00'
    tweet_date = year + '-' + month + '-' + day + 'T' + time + 'Z'
    try:
        for temp in parsedTweet.hashtags:
            hashtags = hashtags + temp.match + ' '
        hashtags = hashtags.replace('#', '').strip(' ')
    except TypeError as te:
        hashtags = ''
    try:
        for temp in parsedTweet.mentions: