def remove_unwanted(self, output_df):
    # Round-trip through JSON to get a plain records-oriented DataFrame
    comments = pd.read_json(output_df.to_json(orient='records'))
    # Make sure the emoji code table is available before replacing
    demoji.download_codes()
    comments['clean_comments'] = comments['commentString'].apply(
        lambda x: demoji.replace(x, ""))
    comments['clean_parent_comments'] = comments['parentCommentString'].apply(
        lambda x: demoji.replace(x, ""))
    comments['clean_video_title'] = comments['videoTitle'].apply(
        lambda x: demoji.replace(x, ""))
    # Placeholder column for later language detection (was assigned three
    # times in the original; once is enough)
    comments['language'] = 0
    return comments
def demojify_graph(graph):
    # Strip emojis from axis labels on bar charts
    if isinstance(graph, pygal.Bar):
        if hasattr(graph, 'x_labels'):
            graph.x_labels = [demoji.replace(str(x)) for x in graph.x_labels]
        if hasattr(graph, 'y_labels'):
            graph.y_labels = [demoji.replace(str(y)) for y in graph.y_labels]
    # Strip emojis from slice titles on pie charts
    if isinstance(graph, pygal.Pie):
        for i in range(len(graph.raw_series)):
            graph.raw_series[i][1]['title'] = demoji.replace(
                graph.raw_series[i][1]['title'])
def clean_tweet(tweet):
    tok = English()
    # Remove usernames and "RT"
    tweet = re.sub(r"(RT|[@*])(\w*)", " ", tweet)
    # Remove hashtags
    tweet = re.sub(r"#(\w+)", " ", tweet)
    # Remove links
    tweet = re.sub(r"http\S+", " ", tweet)
    # Strip punctuation and bracket characters
    tweet = re.sub(
        r"(\\n)|(\#)|(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])",
        "",
        tweet,
    )
    tweet = re.sub(r"(<br\s*/><br\s*/>)|(\-)|(\/)", " ", tweet)
    # Keep alphanumerics only and convert to lower case
    tweet = re.sub(r"[^a-zA-Z0-9]", " ", tweet.lower())
    # Tweets are usually full of emojis; remove them
    tweet = demoji.replace(tweet, repl="")
    # Stop words add no meaning to tweets, so drop them
    tweet_words = tok(tweet)
    clean_tweets = []
    for word in tweet_words:
        if word.is_stop == False and len(word) > 1:
            clean_tweets.append(word.text.strip())
    tweet = " ".join(clean_tweets)
    return tweet
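# A minimal, hypothetical usage sketch for clean_tweet (the sample tweet is
# invented). It assumes spaCy is installed and that
# `from spacy.lang.en import English` has been done; the exact output depends
# on spaCy's stop-word list.
sample = "RT @user: Loving the new release!! 🎉 https://t.co/abc #python"
print(clean_tweet(sample))
# expected: something like "loving new release"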
def clean_tweets(data):
    new_data = data.copy()
    # Patterns for the different kinds of noise found in raw tweets
    username_hash = r'[#@]\w+'
    punctuation = '[%s]+' % re.escape(string.punctuation)
    special_char = r'[^0-9a-zA-Z\s]+'
    number = r'[0-9]+'
    space = r'\s{2,}'
    space_begin_end = r'^\s+|\s+$'
    url = r'(https?|www):\/{1,}\w+\W+\w+\/{1,}\w+'
    char_ref = r'&\w+;'
    for i in range(len(new_data)):
        new_data[i] = re.sub(char_ref, ' ', new_data[i])
        new_data[i] = re.sub(username_hash, '', new_data[i])
        new_data[i] = re.sub(url, '', new_data[i])
        new_data[i] = re.sub(punctuation, '', new_data[i])
        new_data[i] = re.sub(number, '', new_data[i])
        new_data[i] = re.sub(space_begin_end, '', new_data[i])
        # Collapse runs of whitespace to a single space (replacing with ''
        # would merge adjacent words)
        new_data[i] = re.sub(space, ' ', new_data[i])
        new_data[i] = demoji.replace(new_data[i], '')
        new_data[i] = re.sub(special_char, '', new_data[i])
        new_data[i] = new_data[i].replace('\n', ' ')
        new_data[i] = new_data[i].replace('\xa0', ' ')
        new_data[i] = new_data[i].strip().lower()
    return new_data
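# Hypothetical smoke test for clean_tweets; assumes a plain list of strings
# and that demoji's code table is available.
raw = ["Check this out 🚀 @dev #ai https://example.com/path  &amp; more!!"]
print(clean_tweets(raw))
# expected: something like ['check this out more']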
def is_many_reactions(self) -> bool:
    found_reactions = find_emojis_in_str(self.text)
    if any(is_disallowed_reaction(r) for r in found_reactions):
        return False
    return (REACTIONS_IN_SINGLE_MSG_LIMIT >= len(found_reactions) > 1
            and not demoji.replace(self.text).strip())
def data_cleaner_tweets(column):
    # Strip "RT @user:" prefixes, URLs and any non-alphanumeric characters
    pattern = re.compile(r'RT @[\w_]+: |https?:\/\/.*[\r\n]*|[^A-Za-z0-9]+')
    column = column.apply(lambda x: demoji.replace(str(x), ''))
    column = column.apply(lambda x: re.sub(pattern, ' ', str(x)))
    return column
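# Hypothetical usage on a pandas Series; assumes `import pandas as pd`.
tweets = pd.Series(["RT @bot: hello 🌍 https://t.co/x", "plain text"])
print(data_cleaner_tweets(tweets).tolist())
# expected: something like [' hello  ', 'plain text'] (modulo spacing)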
def url_remove(text):
    # Remove Unicode emojis with demoji
    text = demoji.replace(text, '')
    # Remove custom Discord emojis (<:name:id>) with a regex
    text = re.sub(r'<:\w*:\d*>', '', text)
    # Escape markdown, then drop the escape sequences
    text = discord.utils.escape_markdown(text=text)
    text = text.replace('\\*', '')
    text = text.replace('\\_', '')
    text = text.replace('\\~', '')
    text = text.replace('\\|', '')
    text = text.replace('\\`', '')
    text = text.replace('> ', '')
    # Remove user and role mentions
    text = discord.utils.escape_mentions(text=text)
    text = re.sub(r'<@!\d+>', '', text)
    text = re.sub(r'<@&\d+>', '', text)
    # Remove channel mentions
    text = re.sub(r'<#\d+>', '', text)
    # Remove URLs (str has no .remove() method; use replace instead)
    if text.find('http') != -1:
        pattern = r"https?://[\w/:%#\$&\?\(\)~\.=\+\-]+"
        url_list = re.findall(pattern, text)
        for item in url_list:
            text = text.replace(item, '')
    return text
def build_reply(request, reply_type):
    uploaded_link = request['uploaded_link']
    submission = reddit.submission(request['submission_id'])
    nsfw_warning = ''
    if submission.over_18:
        nsfw_warning = '**NSFW** '
    reply = f"###[{nsfw_warning}{config['DOWNLOAD_TEXT']}]({uploaded_link})"
    header = ''
    announcement = ''
    if request['type'] == 'comment' and reply_type == 'comment':
        announcement = config['ANNOUNCEMENT_COMMENT']
        header = config['HEADER']
    elif request['type'] == 'comment' and reply_type == 'message':
        announcement = config['ANNOUNCEMENT_PM']
    reply = header + reply + announcement
    # Footer
    footer = config['FOOTER']
    if request['sub'] in config['NO_FOOTER_SUBS']:
        footer = ""
    # Emojis
    if request['sub'] in config['NO_EMOJI_SUBS']:
        reply = demoji.replace(reply, "")
    return reply + reddit_tube_ad + footer
def get_df_analysis(spotipyUserAuth, tracks_df, segments=True, min_conf=0.5,
                    min_dur=0.25, tempo=True, sections=False, beats=False,
                    bars=False):
    '''
    spotipyUserAuth : Spotipy auth object.
    tracks_df : DataFrame of tracks ('name', 'id' and 'artists_name' columns)
    segments and tempo : Default True. False if not needed.
    min_conf : minimum confidence to include a segment (range 0-1)
    min_dur : minimum duration/length in secs to include a segment
    sections/beats/bars : Default False. True if they need to be returned.

    Returns : a dict with key/value pairs for all tracks in the playlist.
        Keys : name of track
        Values : list containing tempo and segment dataframe of the track
                 (and sections/beats/bars if asked)
    '''
    tracks_name = list(tracks_df['name'])
    tracks_id = list(tracks_df['id'])
    artists_name = list(tracks_df['artists_name'])
    # get_tracks_analysis returns a list of dictionaries
    tracks_analysis = sc.get_tracks_analysis(spotipyUserAuth, tracks_id)
    df_analysis = {}
    for name_, artists_name_, track_analysis in zip(tracks_name, artists_name,
                                                    tracks_analysis):
        # Remove special characters from names (they may cause issues in file naming)
        name_ = re.sub(r'[*|><:"?/]|\\', "", name_)
        name_ = demoji.replace(name_)
        artists_name_ = re.sub(r'[*|><:"?/]|\\', "", artists_name_)
        name_ = name_ + '-' + artists_name_[:3]
        df_analysis[name_] = sc.get_segments(track_analysis, segments=segments,
                                             min_conf=min_conf, min_dur=min_dur,
                                             tempo=tempo, sections=sections,
                                             beats=beats, bars=bars)
    return df_analysis
def clean_text(text):
    '''
    Removes mentions, non alphanumeric/twitter characters, and emojis
    '''
    t = text.lower()
    t = remove_mentions(t)
    t = re.sub(non_alphanumeric_or_twitter_characters, " ", t)
    return demoji.replace(t)  # remove emojis (replaced with "")
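# clean_text relies on two module-level names not shown in this snippet. A
# minimal, hypothetical sketch of what they could look like (the real pattern
# and remove_mentions may differ):
import re
import demoji

non_alphanumeric_or_twitter_characters = r"[^a-z0-9@#\s]"

def remove_mentions(t):
    # Replace @mentions with a space
    return re.sub(r"@\w+", " ", t)

print(clean_text("Hey @friend, GREAT news!! 😄"))
# expected: something like "hey   great news   "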
def modify_name(name: str) -> str:
    # Download the emoji code table on first use
    if not demoji.last_downloaded_timestamp():
        demoji.download_codes()
    name = demoji.replace(string=name, repl="")
    name = unicodedata.normalize("NFKC", name)
    # Transliterate Cyrillic to Latin using the Wikipedia schema
    name = iuliia.translate(source=name, schema=iuliia.WIKIPEDIA)
    return name.title()
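# Hypothetical usage; assumes the `iuliia` transliteration package is
# installed and that demoji can fetch its code table on first call.
print(modify_name("юлия 🌸"))
# expected: something like "Yuliya"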
def clean_tweets(tweet):
    tweet = tweet.lower()                       # String to lower case
    tweet = re.sub("@[A-Za-z0-9]+", "", tweet)  # Remove mentions
    tweet = re.sub(r"http\S+", "", tweet)       # Remove links
    tweet = re.sub(r'[^\w\s]', "", tweet)       # Remove punctuation
    tweet = demoji.replace(tweet, "")           # Remove emojis
    tweet = tweet.replace("\n", " ")            # Remove new lines
    return tweet
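# Hypothetical quick check for this single-tweet variant of clean_tweets.
print(clean_tweets("@Bob check https://t.co/xyz 🎊 NOW!\nplease"))
# expected: something like " check   now please" (extra spaces are not collapsed)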
def beautify_tweet(row):
    # Drop emojis first, then strip URLs, mentions, hashtags and punctuation
    tweet_text = demoji.replace(row["tweet_text"], "")
    tweet_text = re.sub(r"(?:\@|https?\://)\S+", "", tweet_text)
    tweet_text = re.sub(r"#(\w+)", "", tweet_text)
    tweet_text = re.sub(r"RT|,|[\W\d!@#$%&*:\/]*…|\*|&|>|<", "", tweet_text)
    tweet_text = re.sub(r"’|‘|”|“", "", tweet_text)
    tweet_text = re.sub(
        r"[-«\[\]\(\)\<\>\{\}»—\\\@\#\$\%\&\:\/]|[\.]+|[\!]+|[\?]+|\n",
        "", tweet_text)
    return tweet_text
def is_simple_emoji_or_textual_reaction(self) -> bool:
    if is_disallowed_reaction(self.text):
        return False
    if len(self.text) == 1 or self.text in TEXTUAL_REACTIONS:
        return True
    found_reactions = find_emojis_in_str(self.text)
    return len(found_reactions) == 1 and not demoji.replace(self.text).strip()
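# find_emojis_in_str is not shown in these snippets. A minimal sketch of what
# it could look like, built on demoji.findall (which returns a dict mapping
# each emoji found to its description); the real helper may differ.
def find_emojis_in_str(text: str) -> list:
    return list(demoji.findall(text).keys())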
def de_emojify(self, text):
    """Removes emojis and some special characters from the text.

    :param text: Text that contains emoji
    """
    logger.info("Removing emojis...")
    # Strip invisible directional-isolate characters first
    regex_pattern = re.compile(pattern="[\u2069\u2066]+", flags=re.UNICODE)
    text = regex_pattern.sub('', text)
    return demoji.replace(text, " ")
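# A hedged, standalone illustration of the same two-step cleanup outside the
# class (in the real code this would be self.de_emojify(...)). demoji replaces
# each emoji with a single space here.
import re
import demoji
sample = "\u2066price down 📉\u2069"
sample = re.sub("[\u2069\u2066]+", "", sample)
print(demoji.replace(sample, " "))
# expected: "price down  "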
def processTweet(tweet):
    tweet = demoji.replace(tweet)  # removes emojis
    tweet = tweet.lower()  # lowercase
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', '', tweet)  # removes urls
    tweet = re.sub(r'@[^\s]+', '', tweet)  # removes usernames
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)  # strips the "#" but keeps the tag
    tweet = ''.join(ch for ch in tweet if ch not in set(punctuation))  # removes punctuation
    tweet = ''.join(ch for ch in tweet if ch not in ['…'])  # removes ellipses (not included in punctuation)
    tweet = word_tokenize(tweet)  # tokenize string into a list
    return [word for word in tweet if word not in _stopwords]  # removes English stopwords
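# processTweet depends on module-level names not shown here. A hypothetical
# setup and usage; assumes the NLTK data packages 'punkt' and 'stopwords' are
# available, and that demoji's code table has been downloaded.
from string import punctuation
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
_stopwords = set(stopwords.words('english'))

print(processTweet("Big news from @nasa! 🚀 #space https://nasa.gov"))
# expected: something like ['big', 'news', 'space']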
def preprocessing(i, file, userList):
    tweet_dic = json.loads(i)
    # Normalize the creation date to the top of the hour, then shift by +1h
    date = time.strftime(
        '%Y-%m-%dT%H:00:00Z',
        time.strptime(tweet_dic["created_at"], "%a %b %d %H:%M:%S +0000 %Y"))
    format_str = '%Y-%m-%dT%H:00:00Z'
    dt = datetime.strptime(date, format_str)
    final = dt + timedelta(hours=1)
    tweet_dic['tweet_date'] = final.strftime(format_str)
    # Fall back from full_text to text when necessary
    if 'full_text' in tweet_dic.keys():
        print("tweet has full_text")
    elif 'text' in tweet_dic.keys():
        print("has text instead of full_text")
        tweet_dic['full_text'] = tweet_dic['text']
    else:
        print("No full_text or text")
        return
    # reply_text, poi_id, poi_name
    if tweet_dic['in_reply_to_status_id'] is not None:
        if tweet_dic['in_reply_to_screen_name'] not in userList:
            tweet_dic['poi_name'] = tweet_dic['user']['screen_name']
            tweet_dic['poi_id'] = tweet_dic['user']['id']
        else:
            tweet_dic['poi_name'] = tweet_dic['in_reply_to_screen_name']
            tweet_dic['poi_id'] = tweet_dic['in_reply_to_user_id']
        tweet_dic['reply_text'] = tweet_dic['full_text']
    else:
        tweet_dic['poi_name'] = tweet_dic['user']['screen_name']
        tweet_dic['poi_id'] = tweet_dic['user']['id']
        tweet_dic['reply_text'] = None
    print(tweet_dic['poi_name'])
    # country
    screen_name = tweet_dic['poi_name']
    tweet_dic['country'] = "India"
    if screen_name in india_list:
        tweet_dic['country'] = "India"
    elif screen_name in usa_list:
        tweet_dic['country'] = 'USA'
    elif screen_name in brazil_list:
        tweet_dic['country'] = 'brazil'
    else:
        print("error poi {}".format(tweet_dic['id']))
    # Emoji-free copy of the text, plus the list of emojis that were found
    full_text = tweet_dic['full_text']
    tweet_dic['text_copy'] = demoji.replace(full_text)
    tweet_dic['tweet_emotions'] = list(demoji.findall(full_text).keys())
    json.dump(tweet_dic, file, ensure_ascii=False)
    file.write("\n")
def process_tweets(tweets, stemming=False, stemmer=None, banned_accounts=[]):
    processed_tweets = []
    for tweet in tweets:
        logging.info('Processing tweet: {}'.format(tweet['id']))
        if 'complete_text' in tweet:
            tweet_txt = tweet['complete_text']
            del tweet['complete_text']
        else:
            tweet_txt = tweet['text']
        if tweet['user']['screen_name'] in banned_accounts:
            continue
        # remove emojis, urls, mentions
        processed_txt = tw_preprocessor.clean(tweet_txt)
        processed_txt = demoji.replace(processed_txt).replace('\u200d️', '').strip()
        processed_txt = emoji.get_emoji_regexp().sub(u'', processed_txt)
        tokens = [token.lower() for token in wordpunct_tokenize(processed_txt)]
        if tweet['lang'] == 'es':
            stop_words = stopwords.words('spanish')
            punct_signs = ['.', '[', ']', ',', ';', ')', '),', '(']
            stop_words.extend(punct_signs)
            words = [token for token in tokens if token not in stop_words]
            if stemming:
                stemmers = [stemmer.stem(word) for word in words]
                processed_txt = ' '.join([
                    stem for stem in stemmers
                    if stem.isalpha() and len(stem) > 1
                ])
            else:
                processed_txt = ' '.join(word for word in words)
        else:
            processed_txt = ' '.join([token for token in tokens])
        tweet['text'] = processed_txt
        if 'sentiment' in tweet:
            tweet['sentiment_polarity'] = tweet['sentiment']['score']
            del tweet['sentiment']
        tweet['hashtags'] = []
        for hashtag in tweet['entities']['hashtags']:
            tweet['hashtags'].append(hashtag['text'])
        #tweet['urls'] = []
        #for url in tweet['entities']['urls']:
        #    tweet['urls'].append(url['expanded_url'])
        tweet['mentions'] = []
        for mention in tweet['entities']['user_mentions']:
            tweet['mentions'].append(mention['screen_name'])
        del tweet['entities']
        tweet['url'] = f"http://www.twitter.com/{tweet['user']['screen_name']}/status/{tweet['id']}"
        del tweet['user']
        dt = datetime.strptime(tweet['created_at_date'], '%Y-%m-%d')
        tweet['month'] = dt.month
        tweet['year'] = dt.year
        tweet['week_month'] = f'{week_of_month(dt)}-{dt.month}'
        processed_tweets.append(tweet)
    return processed_tweets
async def say(ctx, *, arg: str):
    filename = f"{uuid.uuid4().hex}.wav"
    # Strip custom Discord emojis (<:name:id>), then Unicode emojis
    pruned = re.sub(r'(<:.*:\d*>)', '', arg).strip()
    cleaned = demoji.replace(pruned, "")
    if setup['azure']['max_chars'] == 0 or len(cleaned) <= setup['azure']['max_chars']:
        await speak(ctx, cleaned, filename, True)
    else:
        await ctx.send(
            f"The message is longer than {setup['azure']['max_chars']} characters"
        )
def remove_emoji(self):
    # emoji_pattern = re.compile(
    #     u'(\U0001F1F2\U0001F1F4)|'        # Macau flag
    #     u'([\U0001F1E6-\U0001F1FF]{2})|'  # flags
    #     u'([\U0001F600-\U0001F64F])'      # emoticons
    #     "+", flags=re.UNICODE)
    # self.filename = emoji_pattern.sub('', self.filename)
    try:
        self.filename = demoji.replace(self.filename, '#')
    except IOError:
        # The emoji code table was missing; download it and retry once
        # (the original downloaded the codes but never retried the replace)
        demoji.download_codes()
        self.filename = demoji.replace(self.filename, '#')
def filter_tweets_coronavirus(orig_tweet_text, nphrases_dict, country):
    # Normalize the tweet text: strip HTML entities, hashes, mentions and emojis
    orig_text = orig_tweet_text.strip().lower()
    orig_text = re.sub("&amp;|&gt;|&lt;", " ", orig_text)
    orig_text = orig_text.replace("\n", " ").replace("#", "").replace("@", "").lower().strip()
    orig_text = demoji.replace(orig_text, " ")
    # Keep the tweet if any phrase for this country (or a global "na" phrase) matches
    for ph, ph_country in nphrases_dict.items():
        if country == ph_country or ph_country == "na":
            if find_word_in_str(orig_text, ph, True):
                return True
    return False
def remove_emoji(data):
    import demoji

    # Fetch the emoji code table before doing any replacements
    demoji.download_codes()
    updated_data = {}
    for key in list(data.keys()):
        new_lines = []
        for line in data[key]:
            updated = demoji.replace(line, repl="").strip()
            # Drop lines that contained nothing but emojis
            if updated:
                new_lines.append(updated)
        updated_data[key] = new_lines
    return updated_data
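# Hypothetical usage on a dict of line lists; lines that consist only of
# emojis are dropped entirely.
chat = {"general": ["hello 👋", "🔥🔥🔥", "bye"]}
print(remove_emoji(chat))
# expected: {'general': ['hello', 'bye']}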
def assign_country_tweets(orig_tweet_text, tweet_text, nphrases,
                          phrases_country, phrases_country_freq,
                          phrases_hashtag, eu_countries, coded_countries):
    hashtags = set()
    countries = defaultdict(int)
    text = tweet_text.lower().strip()
    final_country = set()
    # Normalize the original tweet text the same way as in the filter step
    orig_tweet_text = re.sub("&amp;|&gt;|&lt;", " ", orig_tweet_text)
    orig_text = orig_tweet_text.replace("\n", " ").replace("#", "").replace("@", "").lower().strip()
    orig_text = demoji.replace(orig_text, " ")
    for ph in nphrases:
        if find_word_in_str(text, ph) or find_word_in_str(orig_text, ph, True):
            if phrases_hashtag.get(ph, None) is not None:
                hashtags.add(phrases_hashtag[ph])
        # Country counting is currently disabled:
        #if phrases_country.get(ph, None) is not None:
        #    if phrases_country[ph] == "eu" or phrases_country[ph] not in only_en_countries:
        #        countries["eu"] += phrases_country_freq[ph]
        #    else:
        #        countries[phrases_country[ph]] += phrases_country_freq[ph]
    #for ph, ctry in coded_countries.items():
    #    if find_word_in_str(text, ph) or find_word_in_str(orig_text, ph, True):
    #        final_country.add(ctry)
    if len(hashtags) == 0:
        hashtags = None
    # Disabled fallback that resolved a single country from the matches:
    #if len(countries) == 0 and len(final_country) == 0:
    #    final_country = "eu"
    #else:
    #    if len(final_country) > 0:
    #        if "usa" in final_country:
    #            final_country = "usa"
    #        elif "uk" in final_country:
    #            final_country = "uk"
    #        else:
    #            final_country = "eu"
    #    else:
    #        if countries.get("usa", None) is not None:
    #            final_country = "usa"
    #        elif countries.get("uk", None) is not None:
    #            final_country = "uk"
    #        else:
    #            final_country = "eu"
    #return [final_country, hashtags]
    return hashtags
def translate_from_bel(self, text):
    # Replace emojis with spaces before language detection
    text = demoji.replace(text, " ")
    translator = Translator()
    lan = translator.detect(text).lang
    if lan != 'be':
        return text
    result = translator.translate(text, src='be', dest='ru')
    translated_text = result.text
    # Restore "HH:MM" times that the translator breaks apart
    for whole, first_digit, second_digit in time_fix_re.findall(translated_text):
        translated_text = translated_text.replace(
            whole, "%s:%s" % (first_digit, second_digit))
    return translated_text
def expand_urls(urls):
    # The urlex.org API expands shortened URLs; multiple URLs are joined with "***"
    request_base = "https://b6dkeerw62x81o1j.pro.urlex.org/json/"
    request_payload = "***".join(urls)
    # Emojis in the payload would break the request path, so strip them
    new_request_payload = demoji.replace(request_payload)
    request_url = request_base + new_request_payload
    try:
        with urllib.request.urlopen(request_url) as response:
            data = response.read()
            return data
    except Exception:
        # Fall back to the unexpanded URL string on any request failure
        return request_url
def obtain_data(self):
    if self.parser_type == 'electricity':
        url_template = 'https://www.epsdistribucija.rs/Dan_{{day}}_Iskljucenja.htm'
        for i in range(0, 4):
            url = url_template.replace('{{day}}', str(i))
            self.make_request(url)
            self.parse_data()
    else:
        bus_route_url = os.environ.get('BUS_ROUTE_URL')
        feed = feedparser.parse(bus_route_url)
        tags_to_skip = [
            'Aktivne izmene na linijama', 'Planirane izmene', 'Informacija'
        ]
        current_changes = []
        for post in feed.entries:
            for tag in post.tags:
                if tag.term not in tags_to_skip:
                    bus_route_number = tag.term.split('a ')[1]
                    bus = Bus.where('bus_route_number', '=',
                                    bus_route_number).first()
                    content = self.get_data_for_bus(post.link)
                    # Route descriptions may contain emojis; strip them
                    current_live_route_change = demoji.replace(content)
                    if bus is None:
                        bus = Bus()
                        bus.bus_route_number = bus_route_number
                        bus.save()
                    data = {bus.id: []}
                    data[bus.id].append(current_live_route_change)
                    current_changes.append(data)
        for change in current_changes:
            for bus_id in change:
                changes_to_save = change[bus_id]
                bus = Bus.find(bus_id)
                saved_routes = bus.bus_route
                # Drop changes whose hash matches an already-saved route
                # (note: removing from the list while iterating can skip items)
                for saved_route in saved_routes:
                    for change_to_save in changes_to_save:
                        saved_hash = hashlib.blake2b(
                            saved_route.route_change.encode('utf-8')).hexdigest()
                        new_hash = hashlib.blake2b(
                            change_to_save.encode('utf-8')).hexdigest()
                        if saved_hash == new_hash:
                            changes_to_save.remove(change_to_save)
                for change_to_save in changes_to_save:
                    self.update_bus_route(BusRoute(), bus, change_to_save)
def get_folder_features(spotipyUserAuth, filsort_pl=None, pl_name_id=None):
    '''
    Here, we will be using filtered and sorted output. A future edit should
    take user playlist names and ids.

    spotipyUserAuth : Spotipy auth object.
    filsort_pl : Default None. Uses the 4-tuple output from the
                 filtersort_playlist function.
    pl_name_id : Default None. In case filsort_pl is not available, provide a
                 list of (playlist name, id) tuples.

    Returns : a dict with key/value pairs for all playlists in the folder.
        Key : Name of the playlist (string)
        Value : pandas.DataFrame returned from get_playlist_features
    '''
    folder_features = {}
    if filsort_pl is not None:
        for p in filsort_pl:
            # Remove special characters from the name (they may cause issues in file naming)
            pl_name = re.sub(r'[*|><:"?/]|\\', "", p[1])
            pl_name = demoji.replace(pl_name)
            folder_features[pl_name] = get_playlist_features(spotipyUserAuth,
                                                             playlist_id=p[2])
    else:
        for p in pl_name_id:
            # Remove special characters from the name (they may cause issues in file naming)
            pl_name = re.sub(r'[*|><:"?/]|\\', "", p[0])
            pl_name = demoji.replace(pl_name)
            folder_features[pl_name] = get_playlist_features(spotipyUserAuth,
                                                             playlist_id=p[1])
    return folder_features
def format_content_lite(content: str, users: dict = None, newlines: bool = True) -> str:
    """Format raw text content to recognizable HTML

    This is designated the lite function because some parts of Discord
    require special parsing rules.
    """
    # Encode multiline codeblocks (```text```)
    content = re.sub(r'```+((?:[^`]*?\n)?(?:[\s\S]+))\n?```+',
                     _encode_codeblock, content)
    # Encode URLs
    content = re.sub(r'(\b(?:(?:https?|ftp|file)://|www\.|ftp\.)(?:\([-a-zA-Z0'
                     r'-9+&@#/%?=~_|!:,.\[\];]*\)|[-a-zA-Z0-9+&@#/%?=~_|!:,.'
                     r'\[\];])*(?:\([-a-zA-Z0-9+&@#/%?=~_|!:,.\[\];]*\)|[-a-z'
                     r'A-Z0-9+&@#/%=~_|$]))', _encode_url, content)
    # HTML-encode content
    content = html.escape(content)
    # Encode inline codeblocks (`text` or ``text``)
    content = re.sub(r'(``?)([^`]+)\1', _encode_inline_codeblock, content)
    # Encode mentions
    content = re.sub(r'((@everyone)|(@here)|(<@!?(\d+)>)|(<@((.{2,32}?)#\d{4}) \((\d+)\)>)|'
                     r'(@((.{2,32}?)#\d{4}))|(<#\d+>)|(<#(.{1,100}?)>)|(<@&(\d+)>)|'
                     r'(<@&(.{1,100}?)>))', _encode_mentions, content)
    # Encode escaped emojis
    content = re.sub(ESCAPED_EMOJI_PAT, _encode_emojis, content)
    # Process emojis (:text:)
    content = re.sub(EMOJI_REGEX, _process_emojis, content)
    # Process unicode emojis
    content = demoji.replace(content,
                             lambda m: _process_unicode_emojis(m, 'emoji'))
    # Process bold (**text**)
    content = re.sub(r'\*\*((?:\\[\s\S]|[^\\])+?)\*\*(?!\*)', r'<b>\1</b>', content)
    # Process underline (__text__)
    content = re.sub(r'__((?:\\[\s\S]|[^\\])+?)__(?!_)', r'<u>\1</u>', content)
    # Process italic (*text* or _text_)
    content = re.sub(r'\b_((?:__|\\[\s\S]|[^\\_])+?)_\b|\*(?=\S)((?:\*\*|\\[\s\S]|\s+(?:\\[\s\S]|[^\s*\\]|\*\*)|'
                     r'[^\s*\\])+?)\*(?!\*)', r'<i>\1\2</i>', content)
    # Process strike through (~~text~~)
    content = re.sub(r'~~(?=\S)((?:\\[\s\S]|~(?!~)|[^\s\\~]|\s+(?!~~))+?)~~',
                     r'<s>\1</s>', content)
    return _format_content(content, users, newlines)
def download_submissions(subreddit, limit=1000):
    # Read API credentials from the local "opts" file (name=value per line)
    settings = {}
    with open('opts') as opts:
        for line in opts:
            name, setting = line.partition('=')[::2]
            settings[name.strip()] = setting.strip()
    reddit = praw.Reddit(client_id=settings['id'],
                         client_secret=settings['secret'],
                         user_agent=settings['user_agent'])
    d = {
        'author': [],
        'clicked': [],
        #'comments': [],
        'created_utc': [],
        'distinguished': [],
        'edited': [],
        'id': [],
        'is_original_content': [],
        'is_self': [],
        'link_flair_template_id': [],
        'link_flair_text': [],
        'locked': [],
        'name': [],
        'num_comments': [],
        'over_18': [],
        'permalink': [],
        #'poll_data': [],
        'score': [],
        'selftext': [],
        'spoiler': [],
        'stickied': [],
        #'subreddit': [],
        'title': [],
        'upvote_ratio': [],
        'url': []
    }
    for submission in reddit.subreddit(subreddit).top('all', limit=limit):
        for key in d:
            val = getattr(submission, key, 'None')
            if isinstance(val, str):
                # Strip emojis and any non-alphanumeric characters
                val = demoji.replace(val)
                val = re.sub('[^A-Za-z0-9]+', '', val)
            d[key].append(val)
    df = pd.DataFrame(data=d)
    df.to_csv('{}_dump.csv'.format(subreddit.strip()))
def preprocess(data):
    # Remove URLs before any other normalization
    data = re.sub(r'(https?:\/\/\S+)', '', str(data))
    data = convert_lower_case(data)
    data = remove_media_omit(data)
    data = remove_punctuation(data)  # remove commas separately
    data = remove_apostrophe(data)
    data = remove_stop_words(data)
    data = convert_numbers(data)
    # data = remove_stop_words(data)  # needed again, as num2words can emit stop words (101 -> "one hundred and one")
    data = lemmatize(data)
    # data = remove_punctuation(data)
    data = demoji.replace(str(data).strip(), '')
    data = word_tokenize(str(data))
    # data = spacy_preprocess(data)
    return data