# Regression test: demoji.replace_with_desc must wrap every emoji in the sample
# tweet fixture with the given separator (":" then "|") while leaving all other
# text untouched.  The expected strings are exact fixture data -- do not reflow
# or "fix" their punctuation.
def test_replace_with_desc(tweet): assert ( demoji.replace_with_desc(tweet, ":") == "#startspreadingthenews yankees win great start by :Santa Claus: medium-dark skin tone: going 5strong innings with 5k’s:fire: :ox:\nsolo homerun :volcano::volcano: with 2 solo homeruns and:ogre: 3run homerun… :clown face: :person rowing boat: medium-light skin tone: :man judge: medium skin tone: with rbi’s … :fire::fire:\n:flag: Mexico: and :flag: Nicaragua: to close the game:fire::fire:!!!….\nWHAT A GAME!!..\n" ) assert ( demoji.replace_with_desc(tweet, "|") == "#startspreadingthenews yankees win great start by |Santa Claus: medium-dark skin tone| going 5strong innings with 5k’s|fire| |ox|\nsolo homerun |volcano||volcano| with 2 solo homeruns and|ogre| 3run homerun… |clown face| |person rowing boat: medium-light skin tone| |man judge: medium skin tone| with rbi’s … |fire||fire|\n|flag: Mexico| and |flag: Nicaragua| to close the game|fire||fire|!!!….\nWHAT A GAME!!..\n" )
def _emojis(self, text, _demoji=False):
    """Replace emoji and ASCII emoticons in *text* with sentiment tokens.

    Positive emoticons become ``EMOPOS`` and negative ones ``EMONEG``; the
    result is then passed through ``self.__handle_coded_emojis``.  When
    *_demoji* is true, emoji are first stripped via ``demoji``.  On any
    exception a diagnostic is printed and ``" "`` is returned (best-effort,
    preserved from the original).
    """
    try:
        if _demoji:
            text = demoji.replace_with_desc(text, "")
        # BUG FIX: the original class was "[" u"\ud83d\ude31" "]+", a UTF-16
        # surrogate pair.  Lone surrogates never occur in a well-formed
        # Python 3 str, so the "face screaming in fear" emoji was never
        # matched.  Use the real code point U+1F631 instead.
        shock_emoji = re.compile("[" "\U0001F631" "]+", flags=re.UNICODE)
        text = shock_emoji.sub(r'EMONEG', text)
        # Smile -- :), : ), :-), (:, ( :, (-:, :')
        text = re.sub(r'(:\)|:-\)|\(\s:|\(-:|:\'\))', 'EMOPOS', text)
        # Laugh -- :D, : D, :-D, xD, x-D, XD, X-D, :-d, :d
        text = re.sub(r'(:D|:-D|x-D|X-D|:-d|:d)', 'EMOPOS', text)
        # Love -- <3, :*
        text = re.sub(r'(<3|:\*)', 'EMOPOS', text)
        # Wink -- ;-), ;), ;-D, ;D, (;, (-;
        text = re.sub(r'(;-\)|;-D|\(-;)', 'EMOPOS', text)
        # Sad -- :-(, : (, :(, ):, )-:, -_-
        text = re.sub(r'(:\(|:-\(|\)\s:|\)-:|-_-)', 'EMONEG', text)
        # Cry -- :,(, :'(, :"(
        text = re.sub(r'(:,\(|:\'\(|:"\()', 'EMONEG', text)
        # Shout -- :@
        text = re.sub(r'(:\@)', 'EMONEG', text)
        text = self.__handle_coded_emojis(text)
        return text
    except Exception as e:
        print('PreProcessor Error => ', e)
        return " "
def preprocess_and_split_text(text):
    """Replace emoji in *text* with space-separated descriptions, then return
    the lower-cased word list."""
    without_emojis = demoji.replace_with_desc(text, sep=" ")
    return [token.lower() for token in without_emojis.split()]
def Emoji(self, text):
    """Handle emoji per ``self.kwargs['emoji']``.

    'stay' returns *text* untouched, 'remove' strips every emoji, and any
    other value replaces emoji with colon-free demoji descriptions.
    """
    mode = self.kwargs['emoji']
    if mode == 'stay':
        return text
    if mode == 'remove':
        return re.sub(emoji.get_emoji_regexp(), '', text)
    return demoji.replace_with_desc(text).replace(':', ' ')
def run(self, event):
    """Demojize every line of the file named in ``self.dataEntry`` into
    ``processed.txt``, then re-enable ``self.btn2``.

    BUG FIX: the original left both file handles open; context managers
    guarantee they are closed even if a line fails to process.
    """
    with open(self.dataEntry.get(), 'r') as src, open('processed.txt', 'w') as out:
        for line in src:
            out.write(demoji.replace_with_desc(line))
    self.btn2.configure(state=tk.NORMAL)
def covert_emoji_to_text(self):
    """Convert emoji to text."""
    raw_comments = self.convert_dict()
    # Translation table that maps every punctuation char to a space.
    strip_punct = str.maketrans(string.punctuation,
                                ' ' * len(string.punctuation))
    cleaned = {}
    for comment_id, comment in raw_comments.items():
        if not isinstance(comment, str):
            continue
        demojized = demoji.replace_with_desc(comment.lower())
        demojized = self.preprocess1(demojized)
        no_punct = str(demojized).translate(strip_punct)
        if self.isEnglish(no_punct):
            cleaned[comment_id] = no_punct
    text_df = pd.DataFrame.from_dict(cleaned, orient='index')
    text_df['index'] = text_df.index
    text_df.columns = ['clean_comments', 'commentId']
    text_df.to_csv(self.path + 'clean_comments.csv', encoding='utf-8-sig')
    return text_df
def convert(message_path):
    """Convert a downloaded Twitch chat JSON file into a plain-text transcript.

    Writes "<user_name> - <video id>.txt" under the module-level *outputpath*,
    skipping messages from *known_bots* and any message containing "@".
    Emoji are rewritten as ":description:" via demoji.
    """
    # Open JSON file containing downloaded twitch chat
    with open(message_path, encoding="utf-8") as read_file:
        data = json.load(read_file)
    filename = outputpath + data["video"]["user_name"] + \
        " - " + data["video"]["id"] + ".txt"
    # BUG FIX: the output handle was never closed; a context manager
    # guarantees it is flushed and closed even on error.
    with open(filename, "w", encoding='utf-8') as f:
        print("Now converting: " + filename, end="\n")
        for i in data["comments"]:
            # Membership test replaces the original known_bots.index() /
            # ValueError dance -- same behavior, clearer intent.
            if i["commenter"]["name"] in known_bots:
                continue
            filtered = demoji.replace_with_desc(i["message"]["body"], sep=":")
            # Drop messages that mention a user.
            if r"@" in filtered:
                continue
            f.write(filtered + "\n")
# Lower-case, demojize, and clean *text*, then translate it with
# google_translator.
# NOTE(review): the result is translated to *tgt_lang* and then immediately
# re-translated to "en", so any tgt_lang other than English is discarded --
# confirm whether the second translate call is intentional.
# NOTE(review): on any exception this prints and falls through, implicitly
# returning None -- callers must tolerate a None result.
def translate_text(text, tgt_lang='en'): try: # Text to lower text = text.lower() # Replace Emojies to text text = demoji.replace_with_desc(text, "") # Remove extra emojies if any text = handle_emojis(text) # Clean Text to remove unwanted tokens text = clean_text(text) # translator Initiated translator = google_translator() # Text translated text = translator.translate(text, lang_tgt=tgt_lang) text = translator.translate(text, lang_tgt="en") # Check Auto Correct # spell = Speller() # text = spell(text) # returning text return text except Exception as e: print('Erorr => ', e) pass
# Convert raw per-day tweet JSON dumps into CSVs, one pass per language.
# Only dates missing from BOTH destination folders are processed; each date's
# tweets are concatenated across folders, cleaned via df_cleaner, saved raw to
# *dest*, then demojized (emoji -> space-separated descriptions, "_" -> " ")
# and saved again to *dest_cleaned*.
# NOTE(review): relies on module-level globals `path_comp` and `source` and on
# helpers `df_cleaner` / swifter defined elsewhere in the file -- confirm they
# are in scope before reusing this function.
def json_converter_nofilter(lang_folders, en_folders, de_folders, dest, dest_cleaned): for lang in lang_folders: if lang == "De": folders = de_folders else: folders = en_folders print("Started working, checking for missing files") # find dates that are missing, check for one folder in source because we assume all have the same dates avaialble files_source = [ k for k in os.listdir(os.path.join(path_comp, source, folders[0])) if ".json" in k ] # convert to datelist dates_source = [ re.search(r'\d{4}-\d{2}-\d{2}', file).group() for file in files_source ] # check which files already exist files_dest = [ k for k in os.listdir(os.path.join(path_comp, dest, folders[0])) if ".csv" in k ] files_dest_cleaned = [ k for k in os.listdir( os.path.join(path_comp, dest_cleaned, folders[0])) if ".csv" in k ] # inner join list and find last date that exists in both files_dest_both = list(set(files_dest) & set(files_dest_cleaned)) # convert to datelist dates_dest = [ re.search(r'\d{4}-\d{2}-\d{2}', file).group() for file in files_dest_both ] # find files in source but not dest dates_missing = list(set(dates_source) - set(dates_dest)) if len(dates_missing) == 0: print("No missing files found") # go thru all dates for date in dates_missing: # set up list tweets = [] #go into each folder folders an concat tweets to df for folder in folders: print(f"Working in {date} {folder}") # create filename from fodler name together wit date filename = f"{folder}_{date}.json" # create path path1 = os.path.join(path_comp, source, folder, filename) # load json files if filename in os.listdir( os.path.join(path_comp, source, folder)): for line in open(path1, 'r', encoding="utf8"): tweets.append(json.loads(line, parse_int=str)) # convert to df df = pd.DataFrame(tweets) # clean dataframe print("Cleaning df") df = df_cleaner(df, lang_controller=True, lang=lang) new_filename_csv = f"{lang}_NoFilter_{date}.csv" # save df print("Saving data") df.to_csv(os.path.join(path_comp, dest, f"{lang}_NoFilter", 
new_filename_csv), index=False) # now replace emojis and save in different destination df["tweet"] = df["tweet"].swifter.progress_bar(False).apply( lambda tweet: demoji.replace_with_desc(tweet, sep=" ")) # replace _ from emojis with " " df.tweet = df.tweet.str.replace("_", " ") #save df df.to_csv(os.path.join(path_comp, dest_cleaned, f"{lang}_NoFilter", new_filename_csv), index=False)
# Merge per-company tweet JSON dumps (German + English source folders) into
# daily CSVs.  For each subfolder: create destination dirs if needed, work out
# which dates are missing from both destinations, then process three disjoint
# date sets -- dates present in BOTH language folders, German-only dates, and
# English-only dates.  Each date is concatenated, cleaned via df_cleaner,
# saved raw to *dest*, then demojized (emoji -> space-separated descriptions,
# "_" -> " ") and saved to *dest_cleaned*.
# NOTE(review): depends on module-level global `path_comp` and helpers
# `df_cleaner` / swifter defined elsewhere -- confirm before reuse.
# NOTE(review): dates missing from only one destination folder are redone
# from scratch by design (comment in code: "quicker than accounting for it").
def json_converter_companies(source, dest, dest_cleaned, company_folders, subfolders): for subfolder in subfolders: print(f"Working on {subfolder}") new_dest = os.path.join(path_comp, dest, "Companies", subfolder) new_dest_cleaned = os.path.join(path_comp, dest_cleaned, "Companies2", subfolder) # create folder in new destination if it does not already exist if not os.path.exists(os.path.join(path_comp, new_dest)): os.mkdir(os.path.join(path_comp, new_dest)) if not os.path.exists(os.path.join(path_comp, new_dest_cleaned)): os.mkdir(os.path.join(path_comp, new_dest_cleaned)) # now go into each company folder in the source an concat files from same day together # for this need to check if files exist in both and control for it files_de = os.listdir( os.path.join(path_comp, source, "Companies_de", f"{subfolder}_de")) files_en = os.listdir( os.path.join(path_comp, source, "Companies_en", f"{subfolder}_en")) # get the dates available in both datasets dates_de = [ re.search(r'\d{4}-\d{2}-\d{2}', file).group() for file in files_de ] dates_en = [ re.search(r'\d{4}-\d{2}-\d{2}', file).group() for file in files_en ] dates_both_source = list(set(dates_de) & set(dates_en)) dates_all_source = list(set(dates_de + dates_en)) # now check which dates already exist at dest files_dest = os.listdir( os.path.join(path_comp, dest, "Companies", subfolder)) files_dest_cleaned = os.listdir( os.path.join(path_comp, dest_cleaned, "Companies2", subfolder)) # inner join files_dest_both = list(set(files_dest) & set(files_dest_cleaned)) # extract dates dates_exist = [ re.search(r'\d{4}-\d{2}-\d{2}', file).group() for file in files_dest_both ] # find all missing dates, in case on folder has more files than redo them again because quicker than accounting for it # and setting up separate loop dates_missing = list(set(dates_all_source) - set(dates_exist)) # find dates missing that exist in both sources dates_both_missing = [ k for k in dates_missing if k in dates_both_source ] if dates_both_missing 
== []: print("No files missing that exist in german and english folders") else: print( f"Moving on to files that only exist in both folders for {subfolder}" ) # now for each date available in both got thru both folders and concat files, then clean and save them for date in dates_both_missing: print(f"Working on {subfolder}, {date}") # go into englisch folder tweets = [] for folder in company_folders: file = f"{subfolder}_{date}_{folder.split('_')[1]}.json" path = os.path.join(path_comp, source, folder, f"{subfolder}_{folder.split('_')[1]}", file) for line in open(path, 'r', encoding="utf8"): tweets.append(json.loads(line, parse_int=str)) # convert to df df = pd.DataFrame(tweets) # clean df df = df_cleaner(df) # check if df still contains entries if len(df) > 0: # save df new_filename_csv = f"{subfolder}_{date}.csv" # save df print("Saving data in both") df.to_csv(os.path.join(path_comp, new_dest, new_filename_csv), index=False) ########### # now replace emojis and save in different destination ########### df["tweet"] = df["tweet"].swifter.progress_bar(False).apply( lambda tweet: demoji.replace_with_desc(tweet, sep=" ") ) # replace _ from emojis with " " df.tweet = df.tweet.str.replace("_", " ") #save df df.to_csv(os.path.join(path_comp, new_dest_cleaned, new_filename_csv), index=False) # now continue for dates not in both dates_de_only = list(set(dates_de) - set(dates_en)) # find missing dates for german only dates_de_only_missing = [ k for k in dates_de_only if k not in dates_exist ] if dates_de_only_missing == []: print("No files missing that exist in german folders only") else: print( f"Moving on to files that only exist in the german folder for {subfolder}" ) # only clean german files for date in dates_de_only_missing: print(f"Working on {subfolder}, {date}") tweets = [] file = f"{subfolder}_{date}_de.json" path = os.path.join(path_comp, source, "Companies_de", f"{subfolder}_de", file) for line in open(path, 'r', encoding="utf8"): tweets.append(json.loads(line, 
parse_int=str)) # convert to df df = pd.DataFrame(tweets) # clean df df = df_cleaner(df) # check if df still cotninas rows if len(df) > 0: # save df new_filename_csv = f"{subfolder}_{date}.csv" # save df print("Saving german data") df.to_csv(os.path.join(path_comp, new_dest, new_filename_csv), index=False) ########### # now replace emojis and save in different destination ########### df["tweet"] = df["tweet"].swifter.progress_bar(False).apply( lambda tweet: demoji.replace_with_desc(tweet, sep=" ") ) # replace _ from emojis with " " df.tweet = df.tweet.str.replace("_", " ") #save df df.to_csv(os.path.join(path_comp, new_dest_cleaned, new_filename_csv), index=False) # same for englisch only # now continue for dates not in both dates_en_only = list(set(dates_en) - set(dates_de)) # find missing dates_en_only_missing = [ k for k in dates_en_only if k not in dates_exist ] if dates_en_only_missing == []: print("No files missing that exist in english folders only") else: print( f"Moving on to files that only exist in the english folder for {subfolder}" ) # only clean german files for date in dates_en_only_missing: print(f"Working on {subfolder}, {date}") tweets = [] file = f"{subfolder}_{date}_en.json" path = os.path.join(path_comp, source, "Companies_en", f"{subfolder}_en", file) for line in open(path, 'r', encoding="utf8"): tweets.append(json.loads(line, parse_int=str)) # convert to df df = pd.DataFrame(tweets) # clean df df = df_cleaner(df) if len(df) > 0: # save df new_filename_csv = f"{subfolder}_{date}.csv" # save df print("Saving english data") df.to_csv(os.path.join(path_comp, new_dest, new_filename_csv), index=False) ########### # now replace emojis and save in different destination ########### df["tweet"] = df["tweet"].swifter.progress_bar(False).apply( lambda tweet: demoji.replace_with_desc(tweet, sep=" ") ) # replace _ from emojis with " " df.tweet = df.tweet.str.replace("_", " ") #save df df.to_csv(os.path.join(path_comp, new_dest_cleaned, new_filename_csv), 
index=False)
# Build word, emoji, and hashtag frequency maps for one feeling's tweet file,
# then render three word clouds (words / emoji / hashtags) to img/ and store
# the results in the module-level dicts `words`, `emoji`, `tweets`, `tags`.
# Per line: extract hashtags, demojize and count known emoji/emoticons,
# expand slang, strip punctuation/USERNAME/URL/citations, tokenize with
# TweetTokenizer, POS-tag, and lemmatize nouns/adjectives/verbs.
# NOTE(review): demoji.replace_with_desc is applied to ASCII emoticons such
# as ":)" which contain no emoji, so those `ejs` entries stay unchanged --
# confirm this is intended.
# NOTE(review): depends on module globals (tweets_path, emojiNeg, emojiPos,
# othersEmoji, negemoticons, posemoticons, slang_words, punctuation,
# pos_tagging, words/emoji/tweets/tags) defined elsewhere.
def analyze_tweets(feeling): tag_list = {} emoji_list = {} words[feeling] = [] lemmatized_tweets = {} tk = TweetTokenizer() lemmatizer = WordNetLemmatizer() # with open(tweets_path + "dataset_dt_" + feeling.lower() + "_test_60k.txt", 'r', encoding="utf8") as file: with open(tweets_path + "dataset_dt_" + feeling.lower() + "_60k.txt", 'r', encoding="utf8") as file: lines = file.readlines() print("Start Analyzing tweet. Feeling: ", feeling) for line in tqdm(lines): # build map for hashtag and remove from line if '#' in line: hashtags = re.findall(r"#(\w+)", line) for htag in hashtags: tag_list[htag] = tag_list.get(htag, 0) + 1 line = line.replace('#' + htag, '').replace('#', '') words[feeling].append(htag) # find, store and replace emoji from line ejs = [demoji.replace_with_desc(em, ":") for em in emojiNeg + emojiPos + othersEmoji + negemoticons + posemoticons if (em in line)] for e in ejs: emoji_list[e] = emoji_list.get(e, 0) + 1 line = line.replace(e, '') words[feeling].append(e) # replace slang from sentences slang_list = [s for s in slang_words.keys() if (s in line.split())] for s in slang_list: line = line.replace(s, slang_words[s]) # remove punctuation punct_list = [p for p in punctuation if (p in line)] for p in punct_list: line = line.replace(p, '') # remove USERNAME and URL line = line.replace('USERNAME', '').replace('URL', '').lower() # remove citations citations = re.findall(r"@(\w+)", line) for cit in citations: line = line.replace('@' + cit, '').replace('@', '') # tokenize sentence word_tokens = tk.tokenize(line) pos_line = pos_tagging(word_tokens) # lemmatize nouns, adjective, verbs for pos in pos_line: if pos[1] in ['j', 'n', 'v']: lemm_w = lemmatizer.lemmatize(pos[0], pos[1]) words[feeling].append(lemm_w) lemmatized_tweets[lemm_w] = lemmatized_tweets.get(lemm_w, 0) + 1 # display word cloud wordcloud_words = WordCloud(max_font_size=50, background_color="white", width=800, height=400).generate_from_frequencies( lemmatized_tweets) wordcloud_emoji = 
WordCloud(max_font_size=50, background_color="white", width=800, height=400).generate_from_frequencies( emoji_list) wordcloud_tag = WordCloud(max_font_size=50, background_color="white", width=800, height=400).generate_from_frequencies( tag_list) wordcloud_words.to_file("img/cloud_words_" + feeling + ".png") wordcloud_emoji.to_file("img/cloud_emoji_" + feeling + ".png") wordcloud_tag.to_file("img/cloud_tag_" + feeling + ".png") # Store emoji, tags and tweets for feeling emoji[feeling] = emoji_list tweets[feeling] = lemmatized_tweets tags[feeling] = tag_list
# Secret-santa style draw: randomly assign gift targets among *participants*,
# retrying the whole draw on ValueError (random.sample can be handed an
# impossible remainder when gifts-per-person > 1 -- see the inline comment).
# With logging enabled, writes one file per participant listing their targets
# and co-gifters, demojizing the line if the wish text cannot be encoded.
# With --graph, places participants on a unit circle and draws each
# assignment as a matplotlib arrow positioned via homogeneous-coordinate
# rotate/scale/translate matrices.
# NOTE(review): depends on the Person class, RSN_KEY and REQUEST_KEY globals
# defined elsewhere in the file.
# NOTE(review): in the UnicodeEncodeError handler, `s` is assumed to be bound
# from the failed ''.join -- true here because the join itself cannot raise
# UnicodeEncodeError, only f.write can; confirm if this block is refactored.
def main(participants: List[str], args: argparse.Namespace, part_df: pd.DataFrame): now = datetime.now().strftime('%Y-%m-%d_%H-%M-%S') dirname = ''.join(['ss_', now]) log = not args.no_log if log and not os.path.isdir(dirname): os.makedirs(dirname) code_length = 3 participants_dict = {} while True: try: for p in participants: p_wants = part_df.loc[part_df[RSN_KEY] == p][REQUEST_KEY].values[0] participants_dict[p] = Person(p, dirname, args.gifts, code_length, p_wants) for participant_name in participants: P = participants_dict[participant_name] if not P.can_pick(): continue targets_needed = P.gifts - len(P.targets) potential_targets = set( filter(lambda u: participants_dict[u].can_be_picked(), participants)) - {P.name} targets = sample(potential_targets, targets_needed) for t in targets: P.targets.append(t) participants_dict[t].get_picked() if args.verbose: print(P.name, ">>>", t) break # Can occur with multiple gift assignments per person, creates impossible situations to resolve, just try again! 
except ValueError: continue if log: for participant_name in participants: P = participants_dict[participant_name] with open(P.filepath, 'w') as f: for t in P.targets: T = participants_dict[t] cohorts = [] for cohort_name in participants: if not cohort_name == participant_name and t in participants_dict[ cohort_name].targets: cohorts.append(cohort_name) try: s = ''.join([ 'You', ' & ' * (len(cohorts) > 0), ', '.join(cohorts), ' have ', t, bool(T.wants) * ', who wants: ', T.wants, '\n' ]) f.write(s) except UnicodeEncodeError as E: s = demoji.replace_with_desc(s) f.write(s) if args.graph: plt.axes() # arrow properties margin = 1.1 width = 0.01 head_width = 2.5 * width head_length = 2 * width shape = 'left' length_includes_head = False squeeze = 0.1 center_offset = 0.01 basis_0 = np.asarray([ 0 + squeeze, 0, 1 ]) # basis start and endpoints to transform later, in the z = 1 plane basis_1 = np.asarray([1 - squeeze, 0, 1]) # plt.xlim(-margin, margin) plt.ylim(-margin, margin) n = len(participants) THETA = np.linspace( 0, 2 * math.pi, n + 1)[:-1] # last value is equal to the zeroth, remove it X = list(map(lambda t: math.cos(t), THETA)) Y = list(map(lambda t: math.sin(t), THETA)) participant_coordinates = {} for x, y, pn in zip(X, Y, participants): P = participants_dict[pn] participant_coordinates[P.name] = (x, y) plt.text(x, y, P.code, horizontalalignment='center', verticalalignment='center') for participant_name in participants: P = participants_dict[participant_name] x0, y0 = participant_coordinates[P.name] for t in P.targets: x1, y1 = participant_coordinates[t] dx = x1 - x0 dy = y1 - y0 slope = dy / dx theta = math.atan( slope) + (dx <= 0) * math.pi # angle associated with line phi = theta - math.pi / 4 # angle associated with normal offset x_offset = center_offset * math.cos(phi) y_offset = center_offset * math.sin(phi) ROTATE = np.asarray([[math.cos(theta), math.sin(theta), 0], [-math.sin(theta), math.cos(theta), 0], [0, 0, 1]]) base_length = math.sqrt(dx**2 + dy**2) 
SCALE = np.asarray([[base_length, 0, 0], [0, base_length, 0], [0, 0, 1]]) TRANSLATION = np.asarray([[1, 0, 0], [0, 1, 0], [x0 + x_offset, y0 + y_offset, 1]]) TRANSFORMATION = np.matmul( np.matmul(ROTATE, SCALE), TRANSLATION) # full transformation in one matrix x0_prime, y0_prime = np.matmul( basis_0, TRANSFORMATION)[:-1] # Project back to 2D plane x1_prime, y1_prime = np.matmul(basis_1, TRANSFORMATION)[:-1] # dx_prime = x1_prime - x0_prime dy_prime = y1_prime - y0_prime plt.arrow(x0_prime, y0_prime, dx_prime, dy_prime, head_width=head_width, head_length=head_length, width=width, shape=shape, length_includes_head=length_includes_head) plt.show()
def demoji_token(token):
    """Return a list mirroring *token* with each entry's emoji replaced by
    their text descriptions."""
    return [demoji.replace_with_desc(word) for word in token]
def demojify(fp: io.IOBase):
    """Stream *fp* line by line to stdout, replacing emoji with descriptions.

    Lines keep their own newlines, so ``end=""`` avoids doubling them.
    """
    for converted in map(replace_with_desc, fp):
        print(converted, end="")
def preprocess_and_split_text(text):
    """Replace emoji in *text* with space-separated descriptions and return
    the whitespace-split token list (case is preserved)."""
    return demoji.replace_with_desc(text, sep=" ").split()
def replace_with_desc(text):
    """Thin wrapper over demoji.replace_with_desc with a fixed "<emoji>"
    separator around each description."""
    separator = "<emoji>"
    return demoji.replace_with_desc(text, separator)
# NOTE(review): orphaned fragment -- the interior of a json-converter loop
# whose enclosing function is outside this view; it ends at a dangling
# `else:` with no body.  Kept byte-identical; do not format or execute as-is.
# save df new_filename_csv = f"{subfolder}_{date}.csv" # save df print("Saving data in both") df.to_csv(os.path.join(new_dest ,new_filename_csv), index = False) ########### # now replace emojis and save in different destination ########### df["tweet"] = df["tweet"].swifter.progress_bar(False).apply(lambda tweet: demoji.replace_with_desc(tweet, sep = " "))# replace _ from emojis with " " df.tweet = df.tweet.str.replace("_", " ") #save df df.to_csv(os.path.join(new_dest_cleaned,new_filename_csv), index = False) # now continue for dates not in both dates_de_only = list(set(dates_de) - set(dates_en)) # find missing dates for german only dates_de_only_missing = [k for k in dates_de_only if k not in dates_exist] if dates_de_only_missing == []: print("No files missing that exist in german folders only") else:
# Convert one Zoom chat .txt log into a WebVTT subtitle file, shifting every
# chat timestamp by the user-supplied call start time (prompted interactively
# until it matches HH:MM:SS).
# NOTE(review): the parameter shadows the legacy builtin name `file`; relies
# on module globals `output_folder`, `path` and helper `to_delta` defined
# elsewhere.
# NOTE(review): the final `datetime.time(datetime.strptime(...) + timedelta)`
# works only because `datetime.time` is invoked unbound on a datetime
# instance (equivalent to `(...).time()`) -- confirm this is intentional
# before refactoring.
def process_file(file): offset_time = None while not offset_time: print() offset_time_input = input( "Please input the time for " + file + " that the Zoom call started as hours:minutes:seconds in military/24hr time (e.g. 16:32:40): " ) offset_time = re.match(r'\d{2}:\d{2}:\d{2}', offset_time_input) offset_time_group = offset_time.group() offset_time_delta = to_delta(offset_time_group + '.000') with open(output_folder + '/' + file[:-4] + '.vtt', 'w', encoding='utf-8') as vtt: vtt.write('WEBVTT' + '\n') with open(path + '/' + file, 'r', encoding='utf-8') as chat: storage = '' start_time = None end_time = None new_start_time = None for line in chat: demojied_line = demoji.replace_with_desc(line) split_line = demojied_line.split('\t') if len(split_line) > 3: print( 'Looks like there was a tab in the text, please check the output for accuracy.' ) time = re.findall(r'\d{2}:\d{2}:\d{2}', split_line[0]) if time: if len(time) > 1: print( 'There\'s an issue with the Zoom transcript timestamps' ) else: start_time = new_start_time start_time_delta = to_delta(time[0] + '.000') updated_start_delta = start_time_delta - offset_time_delta str_start_delta = str(updated_start_delta) if len(str_start_delta.split(':')[0]) < 2: str_start_delta = '0' + str_start_delta new_start_time = str_start_delta if start_time: updated_end_delta = updated_start_delta - timedelta( milliseconds=1) # updated_end_delta = end_time_delta - offset_time_delta str_end_delta = str(updated_end_delta) if len(str_end_delta.split(':')[0]) < 2: str_end_delta = '0' + str_end_delta end_time = str_end_delta vtt.write('\n' + start_time + '.000 --> ' + end_time[:-3] + '\n') vtt.write(storage) storage = ' '.join(split_line[1:]) else: vtt.write(''.join(split_line)) vtt.write('\n') end_time = datetime.time( datetime.strptime(new_start_time, '%H:%M:%S') + timedelta(milliseconds=2000)) vtt.write( str(new_start_time) + '.000 --> ' + str(end_time) + '.000\n') vtt.write(storage)
# NOTE(review): exploratory notebook-style fragment (#%% cells) comparing
# emoji.demojize vs demoji.replace_with_desc over a tweet dataframe; the
# first line continues a call whose opening is outside this view.  Kept
# byte-identical; do not format or execute as-is.
lambda tweet: emoji.demojize(tweet, delimiters=(" ", " "))) time2 = time.time() - time1 print(time2) #%% a = df[["id", "tweet", "tweet_n"]].head(1000) #%% with different package import demoji demoji.download_codes() #%% print(demoji.findall(text)) #%% print(demoji.replace_with_desc(text, sep="")) #%% take time time1 = time.time() df["tweet_n"] = df["tweet"].swifter.progress_bar(False).apply( lambda tweet: demoji.replace_with_desc(tweet, sep=" ")) time2 = time.time() - time1 print(time2) #%% replace _ from emojis with " " df.tweet_n = df.tweet_n.str.replace("_", " ") #%% replace emoticons from emot.emo_unicode import UNICODE_EMO, EMOTICONS import re