def _handle_message(self, msg):
    """parse a single message row"""
    msg['number'] = '00' + msg['number'].split('@')[0]
    msg['name'] = self._numberdict.get(msg['number'], msg['number'])
    msg['verb'] = 'to' if msg['type'] else 'from'
    msg['type'] = 'OUTGOING' if msg['type'] else 'INCOMING'
    msg['handler'] = self._args.handler
    if msg['text']:
        if self._args.demojize:
            msg['text'] = emoji.demojize(msg['text'])
        if self._args.skip_emoji:
            msg['text'] = re.sub(emoji.get_emoji_regexp(), '', msg['text'])
    timestamp = datetime.datetime.fromtimestamp(msg['timestamp'] / 1000)
    properties = OrgProperties(data_for_hashing=json.dumps(msg))
    properties.add('NUMBER', msg['number'])
    properties.add('TYPE', msg['type'])
    output = self._args.output_format.format(**msg)
    if msg['text'] and not self._is_ignored(msg):
        self._writer.write_org_subitem(timestamp=OrgFormat.datetime(timestamp),
                                       output=output,
                                       properties=properties)
def count_emojis(tweets, nb):
    e = emoji.get_emoji_regexp()
    emojis = []
    for x in tweets:
        match = e.search(x)
        if match:
            emojis.append(match.group())
    dfe = pd.DataFrame(emojis, columns=['text'])
    return dfe
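# Illustrative usage sketch for count_emojis above, not part of the original snippet.
# Assumes `import emoji` / `import pandas as pd` and an emoji release (< 2.0) that still
# provides get_emoji_regexp(). Note that only the first emoji per tweet is collected,
# and the `nb` argument is unused by the function.
sample_tweets = ["good morning ☀️", "no emoji here", "two 🎉🎉 but only the first is kept"]
print(count_emojis(sample_tweets, nb=None))
# one row per tweet that contained at least one emoji: ☀️ and 🎉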
def deEmojify(inputString): """ Emojileri ve diğer güvenli olmayan karakterleri metinden kaldırır. """ return get_emoji_regexp().sub(u'', inputString)
def deEmojify(inputString):
    return get_emoji_regexp().sub(u'', inputString)
def deEmojify(inputString):
    # removes emojis for safe string handling
    return get_emoji_regexp().sub(u'', inputString)
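# Illustrative usage sketch for the deEmojify variants above (all three behave the same).
# Assumes `from emoji import get_emoji_regexp`, i.e. an emoji release older than 2.0,
# where get_emoji_regexp() still exists.
from emoji import get_emoji_regexp

print(deEmojify("thanks 🙏 see you 👋"))  # -> "thanks  see you "

# On emoji >= 2.0 the regexp helper was removed; a comparable call (an assumption about
# the newer API, not part of the snippets above) would be:
# import emoji
# cleaned = emoji.replace_emoji("thanks 🙏 see you 👋", replace="")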
def DownloadIllusts(self, url: str, base_path: str) -> int:
    """Download from a Nico Nico Seiga artwork page URL.

    Notes:
        Actual Seiga image (redirect target)
            http://seiga.nicovideo.jp/image/source?id={illust_id}
        Seiga metadata (xml)
            http://seiga.nicovideo.jp/api/illust/info?id={illust_id}
        User name lookup (xml); user_id is included in the Seiga metadata
            https://seiga.nicovideo.jp/api/user/info?id={user_id}

    Args:
        url (str): Nico Nico Seiga artwork page URL
        base_path (str): base path of the destination directory

    Returns:
        int: 0 on successful download, 1 if skipped, -1 on error
    """
    illust_id = self.GetIllustId(url)
    author_id, illust_title = self.GetIllustInfo(illust_id)
    author_name = self.GetAuthorName(author_id)

    # Sanitize characters that are not allowed in paths
    # TODO:: make the sanitization stricter
    regex = re.compile(r'[\\/:*?"<>|]')
    author_name = regex.sub("", author_name)
    author_name = emoji.get_emoji_regexp().sub("", author_name)
    author_id = int(author_id)
    illust_title = regex.sub("", illust_title)
    illust_title = emoji.get_emoji_regexp().sub("", illust_title)

    # Build the destination path for the image
    save_directory_path = self.MakeSaveDirectoryPath(
        author_name, author_id, illust_title, illust_id, base_path)
    sd_path = Path(save_directory_path)
    if save_directory_path == "":
        return -1

    # Get the direct link to the image
    source_url = self.GetSourceURL(illust_id)
    if source_url == "":
        return -1

    # Create the {author name} directory
    sd_path.parent.mkdir(parents=True, exist_ok=True)

    # Check whether the file already exists.
    # The extension is unknown until the file is actually downloaded, so instead
    # check whether the target folder contains a file whose name includes illust_id.
    name = sd_path.name
    pattern = r"^.*\(" + str(illust_id) + r"\).*$"
    same_name_list = [
        f for f in sd_path.parent.glob("**/*") if re.search(pattern, str(f))
    ]

    # If it already exists, skip instead of downloading again
    if same_name_list:
        name = same_name_list[0].name
        logger.info("Download seiga illust: " + name + " -> exist")
        return 1

    # Download the image
    response = self.session.get(source_url, headers=self.headers)
    response.raise_for_status()

    # Determine the extension
    ext = self.GetExtFromBytes(response.content)

    # Build the file name
    name = "{}{}".format(sd_path.name, ext)

    # Save directly under the {author name} directory
    with Path(sd_path.parent / name).open(mode="wb") as fout:
        fout.write(response.content)
    logger.info("Download seiga illust: " + name + " -> done")

    return 0
def remove_emoji(tokens):
    """Function to remove emoji in text"""
    tokens = [emoji.get_emoji_regexp().sub(u'', ''.join(tokens))]
    return tokens
def clean_emoji(text):
    text = emoji.get_emoji_regexp().sub(r'', text)
    text = re.sub(r'[\^:)(]', '', text)
    return text.strip()
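# Small usage sketch for clean_emoji above: emojis are removed first, then leftover
# '^', ':', '(' and ')' characters (e.g. from kaomoji or emoticons) are stripped.
# Sample string is illustrative only; assumes emoji < 2.0 for get_emoji_regexp().
print(clean_emoji("nice ^_^ 👍 :)"))  # -> "nice _"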
def de_emojify(input_string):
    # Remove emojis and other non-safe characters from string
    return get_emoji_regexp().sub(u'', input_string)
def normalizefield(wodict):
    """Normalize dictionary of raw Endomondo data"""
    if 'speed_avg' in wodict.keys():
        speed = float(wodict['speed_avg'])
        if speed != 0:
            pace_sec = 60 * 60 / speed
            res = time.gmtime(pace_sec)
            wodict['pace'] = time.strftime('%M:%S', res)
            wodict['speed'] = str(round(speed, 2))
        else:
            wodict['pace'] = '0'
            wodict['speed'] = '0'
    if 'speed_avg_kmh' in wodict.keys():
        speed = float(wodict['speed_avg_kmh'])
        if speed != 0:
            pace_sec = 60 * 60 / speed
            res = time.gmtime(pace_sec)
            wodict['pace'] = time.strftime('%M:%S', res)
            wodict['speed'] = str(round(speed, 2))
        else:
            wodict['pace'] = '0'
            wodict['speed'] = '0'
    if 'speed_kmh_avg' in wodict.keys():
        speed = float(wodict['speed_kmh_avg'])
        if speed != 0:
            pace_sec = 60 * 60 / speed
            res = time.gmtime(pace_sec)
            wodict['pace'] = time.strftime('%M:%S', res)
            wodict['speed'] = str(round(speed, 2))
        else:
            wodict['pace'] = '0'
            wodict['speed'] = '0'
    # return normalized

    if 'speed_max' in wodict.keys():
        speed = float(wodict['speed_max'])
        wodict['speed_max'] = str(round(speed, 2))
    if 'speed_max_kmh' in wodict.keys():
        speed = float(wodict['speed_max_kmh'])
        wodict['speed_max'] = str(round(speed, 2))
    if 'speed_kmh_max' in wodict.keys():
        speed = float(wodict['speed_kmh_max'])
        wodict['speed_max'] = str(round(speed, 2))
    # return normalized

    if 'duration' in wodict.keys():
        res = time.gmtime(float(wodict['duration']))
        dur = time.strftime('%H:%M:%S', res)
        wodict['duration'] = dur
    if 'duration_s' in wodict.keys():
        res = time.gmtime(float(wodict['duration_s']))
        dur = time.strftime('%H:%M:%S', res)
        wodict['duration'] = dur
    if 'duration_sec' in wodict.keys():
        res = time.gmtime(float(wodict['duration_sec']))
        dur = time.strftime('%H:%M:%S', res)
        wodict['duration'] = dur
    # return normalized

    if 'sport' in wodict.keys():
        sp = wodict['sport']
        if isinstance(sp, int):
            try:
                wodict['sport'] = SPORTS[sp]
            except KeyError:
                wodict['sport'] = SPORTS[22]  # Unknown sport - 'Other'
        else:
            wodict['sport'] = sp.capitalize().replace('_', ' ')
    # return normalized

    if 'distance' in wodict.keys():
        wodict['distance'] = str(round(float(wodict['distance']), 2))
    if 'distance_km' in wodict.keys():
        wodict['distance'] = str(round(float(wodict['distance_km']), 2))
    # return normalized

    if 'start_time' in wodict.keys():
        tt = _to_python_time(wodict['start_time'])
        wodict['date'] = tt.date()
        wodict['time'] = tt.time()
        wodict['start_time'] = wodict['start_time']
    # return normalized

    if 'message' in wodict.keys():
        wodict['message'] = emoji.get_emoji_regexp().sub(r'', wodict['message'])

    if 'ascent' in wodict.keys():
        wodict['ascend_m'] = wodict['ascent']
    if 'descent' in wodict.keys():
        wodict['descend_m'] = wodict['descent']

    # HEART RATE
    if 'heart_rate_avg' in wodict.keys():
        wodict['heart_rate_avg_bpm'] = wodict['heart_rate_avg']
    if 'heart_rate_max' in wodict.keys():
        wodict['heart_rate_max_bpm'] = wodict['heart_rate_max']
    if 'heart_rate_bpm_avg' in wodict.keys():
        wodict['heart_rate_avg_bpm'] = wodict['heart_rate_bpm_avg']
    if 'heart_rate_bpm_max' in wodict.keys():
        wodict['heart_rate_max_bpm'] = wodict['heart_rate_bpm_max']

    if 'cadence_avg' in wodict.keys():
        wodict['cadence_avg_rpm'] = wodict['cadence_avg']
    if 'cadence_max' in wodict.keys():
        wodict['cadence_max_rpm'] = wodict['cadence_max']

    # ALTITUDE
    if 'altitude_min' in wodict.keys():
        wodict['altitude_min_m'] = wodict['altitude_min']
    if 'altitude_max' in wodict.keys():
        wodict['altitude_max_m'] = wodict['altitude_max']
    if 'altitude_m_min' in wodict.keys():
        wodict['altitude_min_m'] = wodict['altitude_m_min']
    if 'altitude_m_max' in wodict.keys():
        wodict['altitude_max_m'] = wodict['altitude_m_max']

    if 'calories' in wodict.keys():
        wodict['calories_kcal'] = wodict['calories']
def strip_emoji_icon(list_name):
    emoji_re = emoji.get_emoji_regexp()
    list_emoji_icon_re = re.compile(u"^" + emoji_re.pattern)
    return list_emoji_icon_re.sub(r"", list_name)
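# Quick usage sketch for strip_emoji_icon above: because the pattern is anchored with "^",
# only an emoji at the very start of the string is stripped; emojis elsewhere are kept.
# Sample strings are illustrative; assumes emoji < 2.0 for get_emoji_regexp().
print(strip_emoji_icon("📋 Groceries"))   # -> " Groceries"
print(strip_emoji_icon("Groceries 📋"))   # -> "Groceries 📋" (unchanged)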
## Code in this cell is essentially copy & pasted from Hadoop: PrepareWeiboCorpus.ipynb .
import re, emoji

atMention_pattern = re.compile(r'@([\u4e00-\u9fa5a-zA-Z0-9_-]{1,30})')
emoticons_pattern = re.compile(r'\[([0-9a-zA-Z\u4e00-\u9fa5]+)\]')
topic_pattern = re.compile(r'#([^#]+)#')
url_pattern = re.compile(r'{LK}([a-zA-Z0-9]{5,10})')
emoji_pattern = emoji.get_emoji_regexp()
whitespace_pattern = re.compile(r'\s+')
rtMention_pattern = re.compile(r'^\s*@([\u4e00-\u9fa5a-zA-Z0-9_-]{1,30})\s*[::]\s*')
markers_pattern = re.compile(r' \{[A-Z]{2}\} ')


def mask(content):
    '''This function replaces many tokens with special tokens.'''
    # "@李名扬: 哈喽❤️~你来看看{LK}3JKS2L 这个里面有没有 @郭德纲 说的那个#宝藏#^_^。我觉得 还可以!"
    # rt_at_user = ''.join(rtMention_pattern.findall(content))
    masked_content = rtMention_pattern.sub('', content)
    # "哈喽❤️~你来看看{LK}3JKS2L 这个里面有没有 @郭德纲 说的那个#宝藏#^_^。我觉得 还可以!"
    masked_content = whitespace_pattern.sub(' {SP} ', masked_content)  # Preserve natural whitespaces
    # "哈喽❤️~你来看看{LK}3JKS2L {SP} 这个里面有没有 {SP} @郭德纲 {SP} 说的那个#宝藏#^_^。我觉得 {SP} 还可以!"
    # links = url_pattern.findall(masked_content)
    masked_content = url_pattern.sub(' {LK} ', masked_content)
    # "哈喽❤️~你来看看 {LK} {SP} 这个里面有没有 {SP} @郭德纲 {SP} 说的那个#宝藏#^_^。我觉得 {SP} 还可以!"
    # usernames = atMention_pattern.findall(masked_content)
    masked_content = atMention_pattern.sub(' {AT} ', masked_content)
    # "哈喽❤️~你来看看 {LK} {SP} 这个里面有没有 {SP} {AT} {SP} 说的那个#宝藏#^_^。我觉得 {SP} 还可以!"
    masked_content = emoji_pattern.sub(r' \1 ', masked_content)
    # "哈喽 ❤️ ~你来看看 {LK} {SP} 这个里面有没有 {SP} {AT} {SP} 说的那个#宝藏#^_^。我觉得 {SP} 还可以!"
    # topics = topic_pattern.findall(masked_content)
    masked_content = topic_pattern.sub(' {TP} ', masked_content)
    # "哈喽 ❤️ ~你来看看 {LK} {SP} 这个里面有没有 {SP} {AT} {SP} 说的那个 {TP} ^_^。我觉得 {SP} 还可以!"
    # emoticons = emoticons_pattern.findall(masked_content)
def checkemotion(request):
    global text
    context2 = {}
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()

    with open('polls/stopwordsfix.csv', 'r') as file:
        stopwords = []
        for line in file:
            clear_line = line.replace("\n", '').strip()
            stopwords.append(clear_line)

    stopwords_list = []
    after_stopwords = []
    text = request.GET['teks_input']

    # cleaning process
    gas = text.strip()
    blob = clean_tweet(gas)
    print("Text Cleaning :", blob)

    # split text and emoticon
    em_split_emoji = emoji.get_emoji_regexp().split(blob)
    em_split_whitespace = [substr.split() for substr in em_split_emoji]
    em_split = functools.reduce(operator.concat, em_split_whitespace)
    strSplit = ' '.join(em_split)
    print("Text Split Emoticon and Text :", strSplit)

    # lowering case process
    lower_case = strSplit.lower()
    print("Text Lower Case :", lower_case)

    # convert emoticon process
    punctuationText = lower_case.translate(str.maketrans('', '', string.punctuation))
    tokenized_words = punctuationText.split()
    for tokenized_words_emoticon in tokenized_words:
        arrayTokenizingEmoticon = []
        arrayTokenizingEmoticon.append(tokenized_words_emoticon)
        with open('polls/EmojiCategory-People.csv', 'r', encoding='utf-8') as fileEmoticon:
            for lineEmoticon in fileEmoticon:
                clear_line_emoticon = lineEmoticon.replace("\n", '').strip()
                emoticon, convert = clear_line_emoticon.split(',')
                if emoticon in arrayTokenizingEmoticon:
                    # emoticon_detection.append(emoticon)
                    tokenized_words.append(convert)
                    print("Emoticon Convert :", emoticon, "to", convert)
    strEmoticonConvert = ' '.join(tokenized_words)
    print("Text Emoticon Convert :", strEmoticonConvert)

    # stemming process
    hasilStemmer = stemmer.stem(strEmoticonConvert)
    print("Text Stemming :", hasilStemmer)

    # stop words process
    punctuationText2 = hasilStemmer.translate(str.maketrans('', '', string.punctuation))
    tokenized_words2 = punctuationText2.split()
    for tokenized_words3 in tokenized_words2:
        if tokenized_words3 not in stopwords:
            stopwords_list.append(stopwords)
            after_stopwords.append(tokenized_words3)
    strTextFix = ' '.join(after_stopwords)
    print("Text After Stop Words : ", strTextFix)

    entryClean = strTextFix
    data = prepare_data()
    handle_command_line(NaiveBayes(data, prepare_vocab(data)))
    print(sentiment)
    context2["output"] = sentiment
    context2["output1"] = text
    return render(request, 'polls/checkemotion.html', context2)
def wrap_emotes(self, line):
    """ Wraps all emoji characters in the line with a <span class="emoji"> element """
    emoji_regexp = emoji.get_emoji_regexp()
    return emoji_regexp.sub(r'<span class="emoji">\1</span>', line)
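# Sketch of what wrap_emotes above produces. The backreference \1 relies on
# emoji.get_emoji_regexp() wrapping its alternation in one capturing group, which holds
# for pre-2.0 emoji releases. `self` is unused, so calling the function directly with a
# placeholder first argument works for this illustration.
print(wrap_emotes(None, "deploy done 🎉"))
# -> 'deploy done <span class="emoji">🎉</span>'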
import logging

import cnfg
import emoji
import numpy as np
import tweepy
from sklearn.manifold import TSNE
from nltk.corpus import stopwords
from nltk.stem.porter import *
from nltk.stem import WordNetLemmatizer
from gensim.models import word2vec
from functools import lru_cache
from pymongo import MongoClient
from requests_oauthlib import OAuth1

cachedStopWords = stopwords.words("english")
emoji_re = emoji.get_emoji_regexp()


# Streaming
def authentify(config_file):
    config = cnfg.load(".twitter_config")
    auth = tweepy.OAuthHandler(config["consumer_key"], config["consumer_secret"])
    auth.set_access_token(config["access_token"], config["access_token_secret"])
    return auth
def removeEmoji(line):
    line = emoji.get_emoji_regexp().sub(u'', line)
    return line
def main():
    # Layout of lower frame of main window
    details_frame = [[
        FieldColumn("Sport: ", '-SPORT-'),
        FieldColumn("Date: ", '-DATE-'),
        FieldColumn("Time: ", '-STARTTIME-'),
        FieldColumn("Duration: ", '-DURATION-'),
        FieldColumn("Distance: ", '-DISTANCE-')
    ], [
        FieldColumn("Pace: ", '-PACE-'),
        FieldColumn("Ascent: ", '-ASC-'),
        FieldColumn("Descent: ", '-DESC-')
    ], [
        sg.Frame('Note', [[sg.Text(key='-NOTE-', size=(180, 6))]])
    ]]

    # List of labels for main table
    tabl_head = [
        'Date', 'Time', 'Type', 'Distance', 'Duration', 'Pace', 'Photos',
        'Note', 'Comments'
    ]

    # Fill data for main table (needed as placeholder to define size for initial layout)
    data = [[
        ' ' * 15, ' ' * 15, ' ' * 15, ' ' * 10, ' ' * 10, ' ' * 10, ' ' * 10,
        ' ' * 45, ' ' * 10
    ] for row in range(16)]

    # Main window layout
    layout = [[
        sg.FolderBrowse(target='-FOLDER-'),
        sg.Input(key='-FOLDER-', enable_events=True),
        sg.Submit(),
        sg.Button('Fetch Comments', key='-FETCH-'),
        sg.Exit()
    ], [
        sg.Table(data,
                 headings=tabl_head,
                 justification='center',
                 select_mode='browse',
                 key='-DATA-',
                 num_rows=30,
                 enable_events=True,
                 bind_return_key=True,
                 max_col_width=100)
    ], [sg.Column(details_frame, expand_y=True, expand_x=True)]]

    window = sg.Window('EndoView', layout, size=(1320, 670), finalize=True)
    window['-DATA-'].bind('<Double-Button-1>', '+DBL+')
    window['-DATA-'].bind('<Return>', '+ENTER+')

    config = configparser.ConfigParser()
    config.read('endoview.ini')
    dd = {}
    max_workouts = 0
    try:
        if 'cache' in config['endoview']:
            folder_path = config['endoview']['BackupFolder']
            window['-FOLDER-'].update(folder_path)
            with open('cache.pkl', 'rb') as f:
                dd = pickle.load(f)
            max_workouts = len(dd)
            with open('index.pkl', 'rb') as f:
                indx = pickle.load(f)
            updatetable(data, dd, window)
    except:
        pass

    while True:  # Event Loop of main window
        event, values = window.read(timeout=100)
        # trap for strange exception
        if event == sg.TIMEOUT_KEY:
            continue
        # print(event, values)
        if event == sg.WIN_CLOSED or event == 'Exit':
            break
        elif event == '-FETCH-':
            # test if endoworkouts.json file is present
            if os.path.isfile(folder_path + '/endoworkouts.json'):
                with open(folder_path + '/endoworkouts.json') as p:
                    comm = json.load(p)
                if comm is not None:
                    updatecomments(dd, comm, indx)
                    with open("cache.pkl", "wb") as write_file:
                        pickle.dump(dd, write_file, pickle.HIGHEST_PROTOCOL)
                    updatetable(data, dd, window)
        elif event == '-FOLDER-' or (event == 'Submit' and len(values['-FOLDER-']) > 0):
            folder_path = values['-FOLDER-']
            # test if endoworkouts.json file is present
            # if os.path.isfile(folder_path+'/endoworkouts.json'):
            #     with open(folder_path+'/endoworkouts.json') as p:
            #         dd = json.load(p)
            #     print('Loading endoworkouts.json')
            #     distance_key='distance_km'
            #     duration_key='duration'
            #     speed_avg_key='speed_avg'
            # else:
            dd, indx = loadfull(folder_path)
            max_workouts = len(dd)
            # print('Loading backup!')
            # we have processed database in memory - let's write cache and create config file
            config = configparser.ConfigParser()
            config['endoview'] = {}
            config['endoview']['Cache'] = 'Y'  # indicate that we have cached data
            config['endoview']['BackupFolder'] = folder_path  # save location of Endomondo backup
            with open('endoview.ini', 'w') as configfile:
                config.write(configfile)
            # now store cache to file system
            with open("cache.pkl", "wb") as write_file:
                pickle.dump(dd, write_file, pickle.HIGHEST_PROTOCOL)
            with open("index.pkl", "wb") as write_file:
                pickle.dump(indx, write_file, pickle.HIGHEST_PROTOCOL)
            updatetable(data, dd, window)
        elif event == '-DATA-':
            try:
                workout = dd[values['-DATA-'][0]]
                window['-SPORT-'].update(workout.get('sport'))
                window['-DATE-'].update(workout.get('date'))
                window['-STARTTIME-'].update(workout.get('time'))
                window['-DURATION-'].update(workout.get('duration'))
                window['-DISTANCE-'].update(workout.get('distance'))
                window['-PACE-'].update(workout.get('pace'))
                window['-ASC-'].update(workout.get('ascend_m'))
                window['-DESC-'].update(workout.get('descend_m'))
                window['-NOTE-'].update(workout.get('message'))
            except (IndexError, KeyError) as err:
                print(err)
        elif event == '-DATA-+DBL+' or event == '-DATA-+ENTER+':
            try:
                # in case of double click or ENTER press on specific line - pop up detailed window
                workout = dd[values['-DATA-'][0]]  # selected workout
                # prepare layout for detailed window
                # define sizes of the details window TODO: bind to desktop size
                win2_width = 1100
                win2_height = 100
                WIN2_HEIGHT_MAX = 700
                windetails = [
                    [
                        FieldColumn("Sport: ", '-SPORT-', workout.get('sport')),
                        FieldColumn("Date: ", '-DATE-', workout.get('date')),
                        FieldColumn("Time: ", '-STARTTIME-', workout.get('time')),
                        FieldColumn("Duration: ", '-DURATION-', workout.get('duration')),
                        FieldColumn("Distance: ", '-DISTANCE-', workout.get('distance'))
                    ],
                    [
                        FieldColumn("Pace: ", '-PACE-', workout.get('pace')),
                        FieldColumn("Ascent: ", '-ASC-', workout.get('ascend_m')),
                        FieldColumn("Descent: ", '-DESC-', workout.get('descend_m')),
                        FieldColumn("Alt min: ", '-ALTMIN-', workout.get('altitude_min_m')),
                        FieldColumn("Alt max: ", '-ALTMAX-', workout.get('altitude_max_m'))
                    ],
                    [
                        FieldColumn("HR AVG: ", '-HAVG-', workout.get('heart_rate_avg_bpm')),
                        FieldColumn("HR MAX: ", '-HMAX-', workout.get('heart_rate_max_bpm')),
                        FieldColumn("Calories: ", '-CAL-', workout.get('calories_kcal')),
                        FieldColumn("Cad AVG: ", '-CADAVG-', workout.get('cadence_avg_rpm')),
                        FieldColumn("Cad MAX: ", '-CADMAX-', workout.get('cadence_max_rpm'))
                    ],
                    [
                        FieldColumn("Speed AVG: ", '-SPAVG-', workout.get('speed')),
                        FieldColumn("Speed MAX: ", '-SPMAX-', workout.get('speed_max')),
                    ]
                ]
                msg = workout.get('message')
                lennote = 0 if msg is None else len(msg)  # find out length of text note
                if lennote > 0:
                    # if there is note in workout - add text field and fill it with note
                    # nlines = msg.count('\n')+1
                    lines = msg.split("\n")
                    nlines = 0
                    for oneline in lines:
                        nlines += int(len(oneline) / 165) + 1  # text breaks at about 165 chars in average
                    nheight = int(lennote / 150) + 1
                    if nlines < nheight:
                        nlines = nheight
                    windetails += [[
                        sg.Frame('Note', [[
                            sg.Text(msg, key='-NOTE-', size=(int(win2_width / 8), nlines))
                        ]])
                    ]]
                    win2_height += nlines * 8 + 50  # extend height of the window
                # check if there are pictures posted to the workout and add layout to the window
                pict = workout.get('pictures')
                if pict is not None:
                    linewidth = 0
                    imgline = []
                    for i in range(0, len(pict)):
                        # try:
                        try:
                            url = pict[i][1].get('picture')[0][0].get('url')
                            data, (imgwidth, imgheight) = get_img_data(
                                folder_path + '/' + url, first=True)
                        except KeyError:
                            url = pict[i].get('picture_file')
                            data, (imgwidth, imgheight) = get_img_data(
                                os.path.join(folder_path, 'Images', os.path.split(url)[1]),
                                first=True)
                        if linewidth + imgwidth > win2_width:
                            windetails += [imgline]
                            win2_height += imgheight + 50
                            imgline = []
                            linewidth = 0
                        imgline.append(sg.Image(key='-IMAGE' + str(i) + '-', data=data))
                        linewidth += imgwidth
                    if imgline != []:
                        windetails += [imgline]
                        win2_height += imgheight + 50
                    # except Exception as err:
                    #     print("Images exception: ", err)
                    #     break
                # create comments section
                comm_num = workout.get('num_comments')
                if comm_num != '':
                    try:
                        comment = workout.get('ecomments').get('data')
                    except AttributeError:
                        comment = workout.get('comments').get('data')
                    for i in range(len(comment)):
                        comtext = comment[i]['text']
                        lines = comtext.split("\n")
                        nlines = 0
                        for oneline in lines:
                            nlines += int(len(oneline) / 100) + 1  # text breaks at about 165 chars in average
                        # comh = int(len(comtext)/100)+1  # height of the comment cell to fit the comment
                        comh = nlines
                        frame_layout = [[
                            sg.Text(emoji.get_emoji_regexp().sub(r'', comment[i]['from']['name']) + ':',
                                    size=(20, comh)),
                            sg.Text(emoji.get_emoji_regexp().sub(r'', comtext),
                                    size=(100, comh),
                                    pad=(0, 0))
                        ]]
                        windetails += frame_layout
                        win2_height += 28  # TODO: add height depending on comment height
                win2_height = WIN2_HEIGHT_MAX if win2_height > WIN2_HEIGHT_MAX else win2_height
                win2layout = [[
                    sg.Column(windetails,
                              scrollable=True,
                              vertical_scroll_only=True,
                              size=(win2_width, win2_height))
                ]]
                win2 = sg.Window('Workout detail', win2layout, finalize=True, modal=True)
                win2.bind('<Escape>', '+ESC+')
                win2.bind('<Return>', '+ENTER+')
                while True:  # Event Loop
                    ev2, val2 = win2.read(timeout=100)  # timeout for debugger
                    if ev2 == sg.TIMEOUT_KEY:
                        continue
                    if ev2 == sg.WIN_CLOSED or ev2 == '+ESC+' or ev2 == '+ENTER+':
                        break
                win2.close()
                del win2layout
                del win2
                del windetails
            except (IndexError, KeyError) as err:
                print(err)
                pass
    window.close()
def _del_emoji(self):
    for i in range(len(self.url_content)):
        self.url_content[i] = emoji.get_emoji_regexp().sub(u'', self.url_content[i])
    return self
def clear_emojis(target: str) -> str:
    """ Removes all Emojis from provided string """
    return get_emoji_regexp().sub(u'', target)
def fill_word_stats(db, u, criteria):
    tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
    own_tweets = []
    rturlcnt = Counter()
    urlcnt = Counter()
    urlpertw = []
    tagcnt = Counter()
    tagpertw = []
    rttags = []
    words = []
    url_to_name = []
    uname = u['screen_name'].lower()
    for t in get_user_tweets(db, u['id'], criteria, batch=1000):
        if 'retweeted_status' in t:
            if 'urls' in t['retweeted_status'] and t['retweeted_status']['urls'] is not None:
                rturlcnt += Counter(
                    urlparse(unshort_url(db, i)).netloc
                    for i in t['retweeted_status'].get('urls', []))
            if 'hashtags' in t['retweeted_status']:
                rttags.append(t['retweeted_status']['hashtags'])
        else:
            if 'urls' in t and t['urls'] is not None:
                urlcnt += Counter(
                    urlparse(unshort_url(db, i)).netloc for i in t['urls'])
                url_to_name.extend(
                    edit_distance(urlparse(unshort_url(db, i)).netloc, uname)
                    for i in t['urls'])
                urlpertw.append(len(t['urls']))
            else:
                urlpertw.append(0)
            if 'hashtags' in t:
                tagcnt += Counter(t['hashtags'])
                tagpertw.append(len(t['hashtags']))
            else:
                tagpertw.append(0)
            if 'text' in t:
                own_tweets.append({'text': t['text']})
    if verbose():
        print " tokenize"
    words = [tknzr.tokenize(s) for s in itertext(iter(own_tweets))]
    wcounts = [len(s) for s in words]
    # tagfreq = Counter(t for s in tags for t in s)
    # tagpertw = [len(s) for s in tags]
    uniqtags = len(tagcnt)
    totaltags = sum(tagcnt.values())
    rttagfreq = Counter(t for s in rttags for t in s)
    uniqrtags = len(rttagfreq)
    totalrtags = sum(rttagfreq.values())
    if verbose():
        print " wc"
    artcnt = 0
    proncnt = 0
    explcnt = 0
    loccnt = 0
    emocnt = 0
    emojicnt = 0
    tw = 0
    tuw = 0
    tu2w = 0
    t2w = 0
    ustat = [0] * 10
    wc = Counter()
    bifreq = Counter()
    capstweets = 0
    try:
        if verbose():
            print " words"
        capstweets = sum(1 if all(w.isupper() for w in s) else 0 for s in words)
        wc = Counter(w for s in words for w in s)
        bigrams = (get_bigrams(s) for s in words)
        twstat = (get_phrase_stats(s) for s in words)
        ustat = reduce(lambda x, y: tuple(map(operator.add, x, y)), twstat)
        if verbose():
            print " bigrams"
        bc = Counter(b for s in bigrams for b in s)
        if verbose():
            print " dicts"
        tuw = len(wc)
        tw = sum(wc.values())
        tu2w = len(bc)
        t2w = sum(bc.values())
        if verbose():
            print " freqs"
        bifreq = bc
        if verbose():
            print " pos"
        for w, i in wc.iteritems():
            wd = deaccent(w.lower())
            if wd in expletives:
                explcnt += i
            if wd in articles:
                artcnt += i
            if wd in pronouns:
                proncnt += i
            if is_location(wd):
                loccnt += i
            if wd in emoticons:
                emocnt += i
            if emoji.get_emoji_regexp().match(wd):
                emojicnt += i
                wc[w] = 0
            if wd in stopwords:
                wc[w] = 0  # do not count stopwords
            if wd in punctuation_chars:
                wc[w] = 0  # do not count punctuation
    except:
        pass
    seen_own = len(own_tweets)
    if seen_own == 0:
        seen_own = 1  # for division
    if verbose():
        print " saving"
    u['total_words'] = tw
    if tw == 0:
        tw = 1  # avoid divzero
    u['min_wptw'] = min(wcounts) if len(wcounts) else 0
    u['avg_wptw'] = numpy.mean(wcounts) if len(wcounts) else 0
    u['med_wptw'] = numpy.median(wcounts) if len(wcounts) else 0
    u['std_wptw'] = numpy.std(wcounts) if len(wcounts) else 0
    u['unique_words'] = tuw
    u['lex_freq'] = 1.0 * tuw / tw
    u['total_bigrams'] = t2w
    if t2w == 0:
        t2w = 1  # avoid divzero
    u['unique_bigrams'] = tu2w
    u['bigram_lex_freq'] = 1.0 * tu2w / t2w
    u['articles'] = artcnt
    u['pronouns'] = proncnt
    u['expletives'] = explcnt
    u['locations'] = loccnt
    u['emoticons'] = emocnt
    u['emoji'] = emojicnt
    u['alltokens'] = ustat[0]
    u['all_caps_words'] = ustat[1]
    u['all_caps_words_pcnt'] = 100.0 * ustat[1] / tw
    u['all_caps_tweets'] = capstweets
    u['all_caps_tweets_pcnt'] = 100.0 * capstweets / seen_own
    u['all_nocaps_words'] = ustat[2]
    u['all_nocaps_words_pcnt'] = 100.0 * ustat[2] / tw
    u['punctuation_chars'] = ustat[3]
    u['total_chars'] = ustat[8]
    u['punctuation_pcnt'] = 100.0 * ustat[3] / max(ustat[8], 1)
    u['digit_chars'] = ustat[4]
    u['digit_pcnt'] = 100.0 * ustat[4] / max(ustat[8], 1)
    u['alpha_chars'] = ustat[5]
    u['alpha_pcnt'] = 100.0 * ustat[5] / max(ustat[8], 1)
    u['upper_chars'] = ustat[6]
    u['upper_pcnt'] = 100.0 * ustat[6] / max(ustat[8], 1)
    u['lower_chars'] = ustat[7]
    u['lower_pcnt'] = 100.0 * ustat[7] / max(ustat[8], 1)
    u['greek_chars'] = ustat[9]
    u['greek_pcnt'] = 100.0 * ustat[9] / max(ustat[8], 1)
    u['total_hashtags'] = totaltags
    u['hashtags_per_tw'] = {
        'min': min(tagpertw) if len(tagpertw) else None,
        'max': max(tagpertw) if len(tagpertw) else None,
        'avg': numpy.mean(tagpertw) if len(tagpertw) else None,
        'med': numpy.median(tagpertw) if len(tagpertw) else None,
        'std': numpy.std(tagpertw) if len(tagpertw) else None
    }
    u['uniq_hashtags'] = uniqtags
    u['total_rt_hashtags'] = totalrtags
    u['uniq_rt_hashtags'] = uniqrtags
    u['most_common_words'] = [{
        'word': i[0],
        'count': i[1]
    } for i in wc.most_common(500)]
    u['most_common_bigrams'] = [{
        'bigram': ' '.join(i[0]),
        'count': i[1]
    } for i in bifreq.most_common(500)]
    u['most_common_hashtags'] = [{
        'hashtag': i[0],
        'count': i[1]
    } for i in tagcnt.most_common(500)]
    u['most_common_rt_hashtags'] = [{
        'hashtag': i[0],
        'count': i[1]
    } for i in rttagfreq.most_common(500)]
    u['most_common_urls'] = [{
        'url': i[0],
        'count': i[1]
    } for i in urlcnt.most_common(500)]
    u['most_common_rt_urls'] = [{
        'url': i[0],
        'count': i[1]
    } for i in rturlcnt.most_common(500)]
    u['seen_urls'] = sum(urlcnt.values())
    u['urls_per_tw'] = {
        'min': min(urlpertw) if len(urlpertw) else None,
        'max': max(urlpertw) if len(urlpertw) else None,
        'avg': numpy.mean(urlpertw) if len(urlpertw) else None,
        'med': numpy.median(urlpertw) if len(urlpertw) else None,
        'std': numpy.std(urlpertw) if len(urlpertw) else None
    }
    u['avg_edit_distance'] = numpy.mean(url_to_name) if len(url_to_name) else None
async def promote_usr(client, message):
    if message.chat.type in ["group", "supergroup"]:
        cmd = message.command
        custom_rank = ""
        chat_id = message.chat.id
        get_group = await client.get_chat(chat_id)
        can_promo = await is_sudoadmin(message)
        if can_promo:
            if message.reply_to_message:
                get_mem = await client.get_chat_member(
                    chat_id, message.reply_to_message.from_user.id)
                user_id = message.reply_to_message.from_user.id
                custom_rank = get_emoji_regexp().sub("", " ".join(cmd[1:]))
                if len(custom_rank) > 15:
                    custom_rank = custom_rank[:15]
            else:
                await msg(message, text="`reply to a user to promote`")
                await asyncio.sleep(5)
                await message.delete()
                return
            if user_id:
                try:
                    await client.promote_chat_member(
                        chat_id,
                        user_id,
                        can_change_info=True,
                        can_delete_messages=True,
                        can_restrict_members=True,
                        can_invite_users=True,
                        can_pin_messages=True,
                    )
                    await asyncio.sleep(2)
                    await client.set_administrator_title(chat_id, user_id, custom_rank)
                    text = "**Promoted**\n"
                    text += f"User: [{get_mem.user.first_name}](tg://user?id={get_mem.user.id})\n"
                    text += f"(`{get_mem.user.id}`)\n"
                    text += f"Chat: `{get_group.title}` (`{chat_id}`)"
                    await msg(message, text=text)
                except UsernameInvalid:
                    await msg(message, text="`invalid username`")
                    await asyncio.sleep(5)
                    await message.delete()
                    return
                except PeerIdInvalid:
                    await msg(message, text="`invalid username or userid`")
                    await asyncio.sleep(5)
                    await message.delete()
                    return
                except UserIdInvalid:
                    await msg(message, text="`invalid userid`")
                    await asyncio.sleep(5)
                    await message.delete()
                    return
                except ChatAdminRequired:
                    await msg(message, text="`permission denied`")
                    await asyncio.sleep(5)
                    await message.delete()
                    return
                except Exception as e:
                    await msg(message, text=f"**Log:** `{e}`")
                    return
        else:
            await msg(message, text="`permission denied`")
            await asyncio.sleep(5)
            await message.delete()
    else:
        await message.delete()
def remove_emoji(text):
    return emoji.get_emoji_regexp().sub(u'', text)
def strip_emoji(text):
    new_text = re.sub(emoji.get_emoji_regexp(), r" ", text)
    return new_text
def deEmojify(inputString): """ Remova emojis e outros caracteres não seguros da string """ return get_emoji_regexp().sub("", inputString)
async def promote_usr(client, message):
    if message.chat.type in ['group', 'supergroup']:
        cmd = message.command
        can_promo = await admin_check(message)
        if can_promo:
            try:
                if message.reply_to_message:
                    user_id = message.reply_to_message.from_user.id
                    custom_rank = get_emoji_regexp().sub('', ' '.join(cmd[1:]))
                else:
                    usr = await client.get_users(cmd[1])
                    custom_rank = get_emoji_regexp().sub('', ' '.join(cmd[2:]))
                    user_id = usr.id
            except IndexError:
                await message.delete()
                return
            if user_id:
                try:
                    await client.promote_chat_member(
                        message.chat.id,
                        user_id,
                        can_change_info=True,
                        can_delete_messages=True,
                        can_restrict_members=True,
                        can_invite_users=True,
                        can_pin_messages=True,
                    )
                    await asyncio.sleep(2)
                    await client.set_administrator_title(
                        message.chat.id,
                        user_id,
                        custom_rank,
                    )
                    await message.delete()
                except UsernameInvalid:
                    await edit_or_reply(message, text=tld('user_invalid'))
                    await asyncio.sleep(5)
                    await message.delete()
                    return
                except PeerIdInvalid:
                    await edit_or_reply(message, text=tld('peer_invalid'))
                    await asyncio.sleep(5)
                    await message.delete()
                    return
                except UserIdInvalid:
                    await edit_or_reply(message, text=tld('id_invalid'))
                    await asyncio.sleep(5)
                    await message.delete()
                    return
                except ChatAdminRequired:
                    await edit_or_reply(message, text=tld('denied_permission'))
                    await asyncio.sleep(5)
                    await message.delete()
                    return
        else:
            await edit_or_reply(message, text=tld('denied_permission'))
            await asyncio.sleep(5)
            await message.delete()
    else:
        await message.delete()
def give_emoji_free_text(text):
    # Delete emoji
    return emoji.get_emoji_regexp().sub(r' ', text.decode('utf8'))
l = logging.getLogger(__name__)
l.setLevel(logging.DEBUG)
ls = logging.StreamHandler()
ls.setLevel(logging.DEBUG)
formatter = logging.Formatter('[%(levelname)s] %(asctime)s: %(message)s')
ls.setFormatter(formatter)
l.addHandler(ls)

retrieved_tweets_count = 0
failed_tweets_count = 0
start_time = datetime.now()
queue = Queue()
threads = []
emoji_regexp = emoji.get_emoji_regexp()
work = True
store = open(DOWNLOADED_TWEETS_PATH, 'a')


class UnknownTwitterEmojiException(Exception):
    pass


def process_tweets():
    while work:
        tweet = queue.get()['text'].replace('\n', ' ')
        extracted_emojis = emoji_regexp.findall(tweet)
        for extracted_emoji in extracted_emojis:
            tweet = tweet.replace(extracted_emoji,
                                  emoji.unicode_codes.UNICODE_EMOJI[extracted_emoji])
        store.write('{}\n'.format(tweet))
        store.flush()
def give_emoji_free_text(text):
    return emoji.get_emoji_regexp().sub(r'', text)
def simpleTokenize(text):
    # Do the no-brainers first
    splitPunctText = splitEdgePunct(text)
    splitPunctText = ' '.join(emoji.get_emoji_regexp().split(splitPunctText))
    textLength = len(splitPunctText)

    # BTO: the logic here got quite convoluted via the Scala porting detour
    # It would be good to switch back to a nice simple procedural style like in the Python version
    # ... Scala is such a pain. Never again.

    # Find the matches for subsequences that should be protected,
    # e.g. URLs, 1.0, U.N.K.L.E., 12:53
    bads = []
    badSpans = []
    for match in Protected.finditer(splitPunctText):
        # The spans of the "bads" should not be split.
        if match.start() != match.end():  # unnecessary?
            bads.append([splitPunctText[match.start():match.end()]])
            badSpans.append((match.start(), match.end()))

    # Create a list of indices to create the "goods", which can be
    # split. We are taking "bad" spans like
    #     List((2,5), (8,10))
    # to create
    #     List(0, 2, 5, 8, 10, 12)
    # where, e.g., "12" here would be the textLength
    # has an even length and no indices are the same
    indices = [0]
    for (first, second) in badSpans:
        indices.append(first)
        indices.append(second)
    indices.append(textLength)

    # Group the indices and map them to their respective portion of the string
    splitGoods = []
    for i in range(0, len(indices), 2):
        goodstr = splitPunctText[indices[i]:indices[i + 1]]
        splitstr = goodstr.strip().split(' ')
        splitGoods.append(splitstr)

    # Reinterpolate the 'good' and 'bad' Lists, ensuring that
    # additional tokens from last good item get included
    zippedStr = []
    for i in range(len(bads)):
        zippedStr = addAllnonempty(zippedStr, splitGoods[i])
        zippedStr = addAllnonempty(zippedStr, bads[i])
    zippedStr = addAllnonempty(zippedStr, splitGoods[len(bads)])

    # BTO: our POS tagger wants "ur" and "you're" to both be one token.
    # Uncomment to get "you 're"
    # splitStr = []
    # for tok in zippedStr:
    #     splitStr.extend(splitToken(tok))
    # zippedStr = splitStr

    return zippedStr
def deEmojify(inputString): """ Remove emojis and other non-safe characters from string """ return get_emoji_regexp().sub(u'', inputString)
def remove_emoji(inputString):
    """ Remove emojis and other non-safe characters from string """
    return re.sub(r"\s\s+", " ", emoji.get_emoji_regexp().sub(u' ', inputString))
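# Usage sketch for the remove_emoji variant above: each emoji is replaced by a space
# and runs of whitespace are then collapsed, so the surrounding words stay separated
# by a single space. Sample string is illustrative; assumes emoji < 2.0 for get_emoji_regexp().
print(remove_emoji("ship it 🚀🚀 today"))  # -> "ship it today"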
("g*d", "god"), ("s*x", "sex"), ("a*s", "ass"), ("a**hole", "asshole"), ("a***ole", "asshole"), ("a**", "ass"), ] REGEX_REPLACER = [] for origin, new in WORDS_REPLACER: o1 = origin.replace("*", "\*") REGEX_REPLACER.append((re.compile(o1), new)) RE_SPACE = re.compile(r"\s") RE_MULTI_SPACE = re.compile(r"\s+") EMOJI_REGEXP = emoji.get_emoji_regexp() UNICODE_EMOJI_MY = {} for k, v in emoji.UNICODE_EMOJI_ALIAS.items(): v = v.strip(':') v = v.replace('_', ' ') UNICODE_EMOJI_MY[k] = f" EMJ {v} " def replace(match): return UNICODE_EMOJI_MY.get(match.group(0)) def my_demojize(string): return re.sub("\ufe0f", "", EMOJI_REGEXP.sub(replace, string))
def _calc_emoji_offset(to_calc):
    emoticons = emoji.get_emoji_regexp().finditer(to_calc)
    return sum(len(e.group(0).encode('utf-16-le')) // 2 - 1 for e in emoticons)
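# Usage sketch for _calc_emoji_offset above: it counts how many extra UTF-16 code units
# the emojis in the string occupy, which is handy when adjusting Telegram-style entity
# offsets. Sample strings are illustrative; assumes emoji < 2.0 for get_emoji_regexp().
print(_calc_emoji_offset("hi 😄"))     # 😄 takes 2 UTF-16 code units -> 1
print(_calc_emoji_offset("no emoji"))  # -> 0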