示例#1
0
文件: whatsapp.py 项目: novoid/Memacs
    def _handle_message(self, msg):
        """Turn one message row into an org sub-item and hand it to the writer.

        ``msg`` is enriched in place (``number``, ``name``, ``verb``,
        ``type``, ``handler`` and, depending on the command-line flags, a
        cleaned ``text``) and then rendered through the configured output
        format.  Rows without text, and rows rejected by
        ``self._is_ignored``, are not written.
        """
        # Keep only the part before the '@' of the raw id and prefix it with '00'.
        number = '00' + msg['number'].split('@')[0]
        msg['number'] = number
        # Fall back to the number itself when no name is known for it.
        msg['name'] = self._numberdict.get(number, number)

        outgoing = bool(msg['type'])
        msg['verb'] = 'to' if outgoing else 'from'
        msg['type'] = 'OUTGOING' if outgoing else 'INCOMING'
        msg['handler'] = self._args.handler

        text = msg['text']
        if text:
            if self._args.demojize:
                text = emoji.demojize(text)
            if self._args.skip_emoji:
                text = re.sub(emoji.get_emoji_regexp(), '', text)
            msg['text'] = text

        # The stored timestamp is divided by 1000 before conversion.
        when = datetime.datetime.fromtimestamp(msg['timestamp'] / 1000)

        # The whole (enriched) row is serialized as the hashing input.
        properties = OrgProperties(data_for_hashing=json.dumps(msg))
        for label, key in (('NUMBER', 'number'), ('TYPE', 'type')):
            properties.add(label, msg[key])

        output = self._args.output_format.format(**msg)

        # Text may have become empty after emoji removal, so re-read it here.
        if not msg['text'] or self._is_ignored(msg):
            return

        self._writer.write_org_subitem(timestamp=OrgFormat.datetime(when),
                                       output=output, properties=properties)
示例#2
0
def count_emojis(tweets, nb):
    """Collect the first emoji match of each tweet into a DataFrame.

    Args:
        tweets: iterable of strings to scan.
        nb: not used by this function; kept so existing callers still work.

    Returns:
        A pandas DataFrame with a single ``text`` column holding, for every
        tweet in which the emoji pattern matches, the first matched text.
    """
    pattern = emoji.get_emoji_regexp()
    hits = (pattern.search(tweet) for tweet in tweets)
    first_matches = [hit.group() for hit in hits if hit]
    return pd.DataFrame(first_matches, columns=['text'])
示例#3
0
def deEmojify(inputString):
    """Return ``inputString`` with every match of the emoji pattern removed."""
    emoji_pattern = get_emoji_regexp()
    return emoji_pattern.sub(u'', inputString)
def deEmojify(inputString):
    """Strip all emoji-pattern matches from ``inputString`` and return the rest."""
    cleaned = get_emoji_regexp().sub(u'', inputString)
    return cleaned
示例#5
0
def deEmojify(inputString):
    """Remove emojis from ``inputString`` for safe string handling."""
    emoji_free = get_emoji_regexp().sub(u'', inputString)
    return emoji_free
示例#6
0
    def DownloadIllusts(self, url: str, base_path: str) -> int:
        """Download the illustration behind a Nico Nico Seiga work page URL.

        Notes:
            Image itself (redirect target):
            http://seiga.nicovideo.jp/image/source?id={illust_id}
            Illustration info (xml):
            http://seiga.nicovideo.jp/api/illust/info?id={illust_id}
            User name lookup (xml); the user_id is part of the illustration info:
            https://seiga.nicovideo.jp/api/user/info?id={user_id}

        Args:
            url (str): Nico Nico Seiga work page URL.
            base_path (str): base path under which the save directory is built.

        Returns:
            int: 0 when the image was downloaded, 1 when it was skipped
            because a matching file already exists, -1 on error.

        Raises:
            Whatever ``self.session.get`` or ``response.raise_for_status()``
            raise; those exceptions are not handled here.
        """

        illust_id = self.GetIllustId(url)
        author_id, illust_title = self.GetIllustInfo(illust_id)
        author_name = self.GetAuthorName(author_id)

        # Strip characters that cannot be used in a path, then strip emoji.
        # TODO: make the sanitizing stricter
        forbidden_chars = re.compile(r'[\\/:*?"<>|]')
        emoji_pattern = emoji.get_emoji_regexp()
        author_name = emoji_pattern.sub("", forbidden_chars.sub("", author_name))
        author_id = int(author_id)
        illust_title = emoji_pattern.sub("", forbidden_chars.sub("", illust_title))

        # Build the path the image will be saved under.
        save_directory_path = self.MakeSaveDirectoryPath(
            author_name, author_id, illust_title, illust_id, base_path)
        if save_directory_path == "":
            return -1
        sd_path = Path(save_directory_path)

        # Resolve the direct link to the image.
        source_url = self.GetSourceURL(illust_id)
        if source_url == "":
            return -1

        # Create the directory that will contain the file.
        sd_path.parent.mkdir(parents=True, exist_ok=True)

        # Check whether the file already exists.  The extension is unknown
        # until the image has been downloaded, so instead look for any path
        # below the target directory that contains "(<illust_id>)".
        # Raw strings avoid the invalid "\(" escape sequences, and re.escape
        # keeps the id from being interpreted as regex syntax.
        id_pattern = re.compile(
            r"^.*\(" + re.escape(str(illust_id)) + r"\).*$")
        same_name_list = [
            f for f in sd_path.parent.glob("**/*")
            if id_pattern.search(str(f))
        ]

        # Already present: skip instead of downloading again.
        if same_name_list:
            name = same_name_list[0].name
            logger.info("Download seiga illust: " + name + " -> exist")
            return 1

        # Download the image.
        response = self.session.get(source_url, headers=self.headers)
        response.raise_for_status()

        # Derive the extension from the downloaded bytes.
        ext = self.GetExtFromBytes(response.content)

        # Final file name.
        name = "{}{}".format(sd_path.name, ext)

        # Save directly below the parent directory of the save path.
        with (sd_path.parent / name).open(mode="wb") as fout:
            fout.write(response.content)
        logger.info("Download seiga illust: " + name + " -> done")

        return 0
def remove_emoji(tokens):
    """Concatenate ``tokens``, strip emoji, and return the result as a one-item list."""
    pattern = emoji.get_emoji_regexp()
    joined = ''.join(tokens)
    return [pattern.sub(u'', joined)]
示例#8
0
def clean_emoji(text):
    """Remove emoji and the characters ``^ : ) (`` from ``text``, then trim it.

    Returns the cleaned string with leading and trailing whitespace stripped.
    """
    without_emoji = emoji.get_emoji_regexp().sub(r'', text)
    without_symbols = re.sub(r'[\^:)(]', '', without_emoji)
    return without_symbols.strip()
示例#9
0
def de_emojify(input_string):
    """Return ``input_string`` with all emoji-pattern matches removed."""
    emoji_pattern = get_emoji_regexp()
    return emoji_pattern.sub(u'', input_string)
示例#10
0
def normalizefield(wodict):
    """Normalize a dictionary of raw Endomondo workout data in place.

    The raw data uses several alternative key names for the same quantity.
    Every alias that is present is processed in a fixed order, so when more
    than one alias exists the last one listed wins.

    Derived / normalized keys written to ``wodict``:
        pace, speed       from the average-speed aliases
        speed_max         from the maximum-speed aliases (string, 2 decimals)
        duration          'HH:MM:SS' from the duration aliases
        sport             display name
        distance          string, rounded to 2 decimals
        date, time        from ``start_time``
        message           emoji removed
        *_m, *_bpm, *_rpm, calories_kcal   plain copies of their aliases

    Args:
        wodict: mutable mapping of raw workout fields; modified in place.

    Returns:
        None.
    """
    # Average speed -> 'pace' (MM:SS) and 'speed' (rounded string).
    for key in ('speed_avg', 'speed_avg_kmh', 'speed_kmh_avg'):
        if key not in wodict:
            continue
        speed = float(wodict[key])
        if speed != 0:
            pace_sec = 60 * 60 / speed
            # NOTE(review): '%M:%S' drops the hour part, so a pace of one
            # hour or more wraps around.
            wodict['pace'] = time.strftime('%M:%S', time.gmtime(pace_sec))
            wodict['speed'] = str(round(speed, 2))
        else:
            wodict['pace'] = '0'
            wodict['speed'] = '0'

    # Maximum speed, rounded to two decimals and stored as a string.
    for key in ('speed_max', 'speed_max_kmh', 'speed_kmh_max'):
        if key in wodict:
            wodict['speed_max'] = str(round(float(wodict[key]), 2))

    # Duration given as a number of seconds -> 'HH:MM:SS'.
    for key in ('duration', 'duration_s', 'duration_sec'):
        if key in wodict:
            elapsed = time.gmtime(float(wodict[key]))
            wodict['duration'] = time.strftime('%H:%M:%S', elapsed)

    # Sport: integer codes are looked up in SPORTS, names are prettified.
    if 'sport' in wodict:
        sp = wodict['sport']
        if isinstance(sp, int):
            try:
                wodict['sport'] = SPORTS[sp]
            except KeyError:
                wodict['sport'] = SPORTS[22]  # Unknown sport - 'Other'
        else:
            wodict['sport'] = sp.capitalize().replace('_', ' ')

    # Distance, rounded to two decimals and stored as a string.
    for key in ('distance', 'distance_km'):
        if key in wodict:
            wodict['distance'] = str(round(float(wodict[key]), 2))

    # Split the start time into separate date and time entries.
    if 'start_time' in wodict:
        tt = _to_python_time(wodict['start_time'])
        wodict['date'] = tt.date()
        wodict['time'] = tt.time()

    # Strip emoji from the free-text note.
    if 'message' in wodict:
        wodict['message'] = emoji.get_emoji_regexp().sub(
            r'', wodict['message'])

    # Plain copies: (source key, normalized key), applied in this order.
    aliases = (
        ('ascent', 'ascend_m'),
        ('descent', 'descend_m'),
        # heart rate
        ('heart_rate_avg', 'heart_rate_avg_bpm'),
        ('heart_rate_max', 'heart_rate_max_bpm'),
        ('heart_rate_bpm_avg', 'heart_rate_avg_bpm'),
        ('heart_rate_bpm_max', 'heart_rate_max_bpm'),
        # cadence
        ('cadence_avg', 'cadence_avg_rpm'),
        ('cadence_max', 'cadence_max_rpm'),
        # altitude
        ('altitude_min', 'altitude_min_m'),
        ('altitude_max', 'altitude_max_m'),
        ('altitude_m_min', 'altitude_min_m'),
        ('altitude_m_max', 'altitude_max_m'),
        ('calories', 'calories_kcal'),
    )
    for source, target in aliases:
        if source in wodict:
            wodict[target] = wodict[source]
 def strip_emoji_icon(list_name):
     """Return ``list_name`` without an emoji at the very start of the string.

     The emoji pattern is anchored with ``^``, so only a match at the
     beginning is removed.
     """
     leading_emoji = re.compile(u"^" + emoji.get_emoji_regexp().pattern)
     return leading_emoji.sub(r"", list_name)
示例#12
0
## Code in this cell is essentially copy & pasted from Hadoop: PrepareWeiboCorpus.ipynb . 
import re, emoji
# '@' followed by a 1-30 character name made of CJK characters
# (U+4E00-U+9FA5), ASCII letters, digits, '_' or '-'; the name is group 1.
atMention_pattern  = re.compile(r'@([\u4e00-\u9fa5a-zA-Z0-9_-]{1,30})')
# '[...]' enclosing one or more digits, ASCII letters or CJK characters.
emoticons_pattern  = re.compile(r'\[([0-9a-zA-Z\u4e00-\u9fa5]+)\]')
# '#...#' enclosing one or more characters other than '#'.
topic_pattern      = re.compile(r'#([^#]+)#')
# The literal marker '{LK}' followed by a 5-10 character alphanumeric code.
url_pattern        = re.compile(r'{LK}([a-zA-Z0-9]{5,10})')
# Emoji matcher supplied by the `emoji` package.
emoji_pattern      = emoji.get_emoji_regexp()
# One or more consecutive whitespace characters.
whitespace_pattern = re.compile(r'\s+')
# An '@name' mention at the very start of the text (optional leading
# whitespace), followed by a colon and any surrounding whitespace.
rtMention_pattern  = re.compile(r'^\s*@([\u4e00-\u9fa5a-zA-Z0-9_-]{1,30})\s*[::]\s*')
# A two-uppercase-letter marker in braces with a space on each side, e.g. ' {SP} '.
markers_pattern    = re.compile(r' \{[A-Z]{2}\} ')

def mask(content):
    '''Replace several kinds of tokens in ``content`` with marker tokens.

    The substitutions run in a fixed order: leading '@name:' prefix removed,
    whitespace runs -> ' {SP} ', '{LK}<code>' links -> ' {LK} ',
    '@name' mentions -> ' {AT} ', emoji padded with spaces,
    '#topic#' -> ' {TP} '.  Each comment below shows the running example
    after the preceding step.
    '''
    # Example input:
    # "@李名扬: 哈喽❤️~你来看看{LK}3JKS2L 这个里面有没有 @郭德纲 说的那个#宝藏#^_^。我觉得  还可以!"
    #rt_at_user     = ''.join(rtMention_pattern.findall(content))
    # Drop the leading "@name:" prefix.
    masked_content = rtMention_pattern.sub('', content)
    # "哈喽❤️~你来看看{LK}3JKS2L 这个里面有没有 @郭德纲 说的那个#宝藏#^_^。我觉得  还可以!"
    masked_content = whitespace_pattern.sub(' {SP} ', masked_content) # Reserve natural whitespaces
    # "哈喽❤️~你来看看{LK}3JKS2L {SP} 这个里面有没有 {SP} @郭德纲 {SP} 说的那个#宝藏#^_^。我觉得 {SP} 还可以!"
    #links          = url_pattern.findall(masked_content)
    # Collapse each "{LK}<code>" link to the bare marker.
    masked_content = url_pattern.sub(' {LK} ', masked_content)
    # "哈喽❤️~你来看看 {LK}  {SP} 这个里面有没有 {SP} @郭德纲 {SP} 说的那个#宝藏#^_^。我觉得 {SP} 还可以!"
    #usernames      = atMention_pattern.findall(masked_content)
    # Replace every remaining "@name" mention.
    masked_content = atMention_pattern.sub(' {AT} ', masked_content)
    # "哈喽❤️~你来看看 {LK}  {SP} 这个里面有没有 {SP}  {AT}  {SP} 说的那个#宝藏#^_^。我觉得 {SP} 还可以!"
    # Surround each emoji with spaces; the replacement refers to group 1,
    # so this assumes the emoji pattern has a capturing group - TODO confirm.
    masked_content = emoji_pattern.sub(r' \1 ', masked_content)
    # "哈喽 ❤️ ~你来看看 {LK}  {SP} 这个里面有没有 {SP}  {AT}  {SP} 说的那个#宝藏#^_^。我觉得 {SP} 还可以!"
    #topics         = topic_pattern.findall(masked_content)
    # Replace every "#topic#" span.
    masked_content = topic_pattern.sub(' {TP} ', masked_content)
    # "哈喽 ❤️ ~你来看看 {LK}  {SP} 这个里面有没有 {SP}  {AT}  {SP} 说的那个 {TP} ^_^。我觉得 {SP} 还可以!"
    #emoticons      = emoticons_pattern.findall(masked_content)
    # NOTE(review): no return statement is visible here, so as written the
    # function returns None; the excerpt looks truncated - confirm against
    # the full source.
示例#13
0
def checkemotion(request):
    """Preprocess the submitted text, run the classifier and render the result.

    Reads ``teks_input`` from ``request.GET``, passes it through cleaning,
    emoji splitting, lower-casing, emoticon conversion, stemming and
    stop-word removal (printing every intermediate result), then calls the
    Naive Bayes helpers and renders ``polls/checkemotion.html`` with the
    original text and the value of ``sentiment``.

    NOTE(review): ``sentiment`` is never assigned in this function;
    presumably it is a module-level name set as a side effect of
    ``handle_command_line`` - confirm.
    """

    # The raw input is published through the module-level name ``text``.
    global text

    context2 = {}

    factory = StemmerFactory()
    stemmer = factory.create_stemmer()

    # Load the stop-word list, one word per line.
    with open('polls/stopwordsfix.csv', 'r') as file:
        stopwords = []
        for line in file:
            clear_line = line.replace("\n", '').strip()
            stopwords.append(clear_line)

    stopwords_list = []
    after_stopwords = []


    text = request.GET['teks_input']

    # cleaning process
    gas = text.strip()
    blob = clean_tweet(gas)
    print("Text Cleaning :", blob)

    # split text and emoticon
    em_split_emoji = emoji.get_emoji_regexp().split(blob)
    em_split_whitespace = [substr.split() for substr in em_split_emoji]
    # Flatten the list of word lists into a single list of tokens.
    em_split = functools.reduce(operator.concat, em_split_whitespace)
    strSplit = ' '.join(em_split)
    print("Text Split Emoticon and Text :", strSplit)

    # lowering case process
    lower_case = strSplit.lower()
    print("Text Lower Case :", lower_case)

    # convert emoticon process
    punctuationText = lower_case.translate(str.maketrans('', '', string.punctuation))
    tokenized_words = punctuationText.split()
    # NOTE(review): ``tokenized_words`` is appended to while it is being
    # iterated, so the appended conversions are visited by this loop too.
    for tokenized_words_emoticon in tokenized_words:
        arrayTokenizingEmoticon = []
        arrayTokenizingEmoticon.append(tokenized_words_emoticon)
        # NOTE(review): the mapping file is reopened and re-read for every token.
        with open('polls/EmojiCategory-People.csv', 'r', encoding='utf-8') as fileEmoticon:
            for lineEmoticon in fileEmoticon:
                clear_line_emoticon = lineEmoticon.replace("\n", '').strip()
                # Each line must contain exactly one comma: "<emoticon>,<word>".
                emoticon, convert = clear_line_emoticon.split(',')
                if emoticon in arrayTokenizingEmoticon:
                    # emoticon_detection.append(emoticon)
                    tokenized_words.append(convert)
                    print("Emoticon Convert :", emoticon, "to", convert)
    strEmoticonConvert = ' '.join(tokenized_words)
    print("Text Emoticon Convert :", strEmoticonConvert)

    # stemming process
    hasilStemmer = stemmer.stem(strEmoticonConvert)
    print("Text Stemming :", hasilStemmer)

    # stop words process
    punctuationText2 = hasilStemmer.translate(str.maketrans('', '', string.punctuation))
    tokenized_words2 = punctuationText2.split()
    for tokenized_words3 in tokenized_words2:
        if tokenized_words3 not in stopwords:
            # NOTE(review): this appends the whole stop-word list for each
            # kept token; ``stopwords_list`` is not read again in this function.
            stopwords_list.append(stopwords)
            after_stopwords.append(tokenized_words3)

    strTextFix = ' '.join(after_stopwords)
    print("Text After Stop Words : ", strTextFix)

    # NOTE(review): ``entryClean`` is not read again in this function.
    entryClean = strTextFix
    
    data = prepare_data()
    handle_command_line(NaiveBayes(data, prepare_vocab(data)))

    print(sentiment)

    context2["output"] = sentiment


    context2["output1"] = text
    return render(request, 'polls/checkemotion.html',context2)
示例#14
0
    def wrap_emotes(self, line):
        """
        Return ``line`` with every emoji match wrapped in a
        <span class="emoji"> element.
        """
        wrapped = emoji.get_emoji_regexp().sub(
            r'<span class="emoji">\1</span>', line)
        return wrapped
示例#15
0
import logging
import numpy as np

from sklearn.manifold import TSNE
from nltk.corpus import stopwords
from nltk.stem.porter import *
from nltk.stem import WordNetLemmatizer
from gensim.models import word2vec

from functools import lru_cache
from pymongo import MongoClient
from requests_oauthlib import OAuth1

# English stop-word list from NLTK, fetched once when the module is imported.
cachedStopWords = stopwords.words("english")

# Emoji-matching regex from the `emoji` package, built once at import time.
# NOTE(review): no `import emoji` is visible among the import lines here -
# confirm it is imported elsewhere in the file.
emoji_re = emoji.get_emoji_regexp()

# Streaming


def authentify(config_file):
    """Build a tweepy OAuth handler from a credentials file.

    Args:
        config_file: path passed to ``cnfg.load``.  A falsy value falls back
            to ``".twitter_config"``, the path this function previously
            hard-coded regardless of the argument.

    Returns:
        A ``tweepy.OAuthHandler`` carrying the consumer key/secret and the
        access token/secret read from the configuration.
    """
    # Honour the caller's path; the parameter used to be silently ignored.
    config = cnfg.load(config_file or ".twitter_config")

    auth = tweepy.OAuthHandler(config["consumer_key"],
                               config["consumer_secret"])

    auth.set_access_token(config["access_token"],
                          config["access_token_secret"])
    return auth

示例#16
0
def removeEmoji(line):
    """Return ``line`` with all emoji-pattern matches removed."""
    emoji_pattern = emoji.get_emoji_regexp()
    return emoji_pattern.sub(u'', line)
示例#17
0
def _workout_comments(workout):
    """Return the comment entries of a workout, or an empty list.

    ``ecomments`` is preferred; ``comments`` is used when ``ecomments`` has
    no ``.get`` (e.g. it is missing and therefore ``None``).
    """
    for key in ('ecomments', 'comments'):
        try:
            return workout.get(key).get('data') or []
        except AttributeError:
            continue
    return []


def main():
    """Run the EndoView main window and its event loop.

    Builds the main window (folder picker, workout table, details frame),
    restores the cached workouts when ``endoview.ini`` says a cache exists,
    and then handles events until the window is closed:

    * ``-FETCH-``: merge comments from ``endoworkouts.json`` in the backup
      folder into the loaded workouts and rewrite ``cache.pkl``.
    * ``-FOLDER-`` / ``Submit``: load a backup folder, then write
      ``endoview.ini``, ``cache.pkl`` and ``index.pkl``.
    * ``-DATA-``: show the selected workout in the lower frame.
    * double click / Return on a row: open a modal detail window with all
      fields, the note, pictures and comments.
    """
    #Layout of lower frame of main window
    details_frame = [
        [
            FieldColumn("Sport: ", '-SPORT-'),
            FieldColumn("Date: ", '-DATE-'),
            FieldColumn("Time: ", '-STARTTIME-'),
            FieldColumn("Duration: ", '-DURATION-'),
            FieldColumn("Distance: ", '-DISTANCE-')
        ],
        [
            FieldColumn("Pace: ", '-PACE-'),
            FieldColumn("Ascent: ", '-ASC-'),
            FieldColumn("Descent: ", '-DESC-')
        ],
        [
            sg.Frame('Note', [[sg.Text(key='-NOTE-', size=(180, 6))]])
        ],
    ]

    #List of labels for main table
    tabl_head = [
        'Date', 'Time', 'Type', 'Distance', 'Duration', 'Pace', 'Photos',
        'Note', 'Comments'
    ]
    #Fill data for main table (needed as placeholder to define size for initial layout)
    data = [[
        ' ' * 15, ' ' * 15, ' ' * 15, ' ' * 10, ' ' * 10, ' ' * 10, ' ' * 10,
        ' ' * 45, ' ' * 10
    ] for _ in range(16)]

    #Main window layout
    layout = [
        [
            sg.FolderBrowse(target='-FOLDER-'),
            sg.Input(key='-FOLDER-', enable_events=True),
            sg.Submit(),
            sg.Button('Fetch Comments', key='-FETCH-'),
            sg.Exit()
        ],
        [
            sg.Table(data,
                     headings=tabl_head,
                     justification='center',
                     select_mode='browse',
                     key='-DATA-',
                     num_rows=30,
                     enable_events=True,
                     bind_return_key=True,
                     max_col_width=100)
        ],
        [sg.Column(details_frame, expand_y=True, expand_x=True)],
    ]

    window = sg.Window('EndoView', layout, size=(1320, 670), finalize=True)
    window['-DATA-'].bind('<Double-Button-1>', '+DBL+')
    window['-DATA-'].bind('<Return>', '+ENTER+')

    config = configparser.ConfigParser()
    config.read('endoview.ini')
    dd = {}
    max_workouts = 0
    # Defined up front so the event handlers never hit an unbound name when
    # no cache could be restored.
    folder_path = ''
    indx = None

    try:
        if 'cache' in config['endoview']:
            folder_path = config['endoview']['BackupFolder']
            window['-FOLDER-'].update(folder_path)
            # NOTE(review): pickle.load can execute arbitrary code from the
            # file; acceptable only because these are caches this program
            # wrote itself.
            with open('cache.pkl', 'rb') as f:
                dd = pickle.load(f)
            max_workouts = len(dd)
            with open('index.pkl', 'rb') as f:
                indx = pickle.load(f)
            updatetable(data, dd, window)
    except KeyError:
        # No usable [endoview] configuration yet: start with an empty table.
        pass
    except Exception as err:
        # Restoring the cache is best-effort, but do not hide why it failed.
        print('Could not restore cached workouts:', err)

    while True:  # Event Loop of main window
        event, values = window.read(timeout=100)  #trap for strange exception

        if event == sg.TIMEOUT_KEY:
            continue
        #print(event, values)
        if event == sg.WIN_CLOSED or event == 'Exit':
            break
        elif event == '-FETCH-':
            #test if endoworkouts.json file is present
            # Reset on every fetch so a missing file neither raises
            # NameError nor re-applies comments from an earlier fetch.
            comm = None
            comments_file = folder_path + '/endoworkouts.json'
            if folder_path and os.path.isfile(comments_file):
                with open(comments_file) as p:
                    comm = json.load(p)
            if comm is not None:
                updatecomments(dd, comm, indx)
            with open("cache.pkl", "wb") as write_file:
                pickle.dump(dd, write_file, pickle.HIGHEST_PROTOCOL)
            updatetable(data, dd, window)
        elif event == '-FOLDER-' or (event == 'Submit'
                                     and len(values['-FOLDER-']) > 0):
            folder_path = values['-FOLDER-']
            dd, indx = loadfull(folder_path)
            max_workouts = len(dd)
            #print('Loading backup! ')
            # we have processed database in memory - let's write cache and create config file
            config = configparser.ConfigParser()
            config['endoview'] = {}
            config['endoview'][
                'Cache'] = 'Y'  #indicate that we have cached data
            config['endoview'][
                'BackupFolder'] = folder_path  #save location of Endomondo backup
            with open('endoview.ini', 'w') as configfile:
                config.write(configfile)
            #now store cache to file system
            with open("cache.pkl", "wb") as write_file:
                pickle.dump(dd, write_file, pickle.HIGHEST_PROTOCOL)
            with open("index.pkl", "wb") as write_file:
                pickle.dump(indx, write_file, pickle.HIGHEST_PROTOCOL)
            updatetable(data, dd, window)
        elif event == '-DATA-':
            try:
                workout = dd[values['-DATA-'][0]]
                window['-SPORT-'].update(workout.get('sport'))
                window['-DATE-'].update(workout.get('date'))
                window['-STARTTIME-'].update(workout.get('time'))
                window['-DURATION-'].update(workout.get('duration'))
                window['-DISTANCE-'].update(workout.get('distance'))
                window['-PACE-'].update(workout.get('pace'))
                window['-ASC-'].update(workout.get('ascend_m'))
                window['-DESC-'].update(workout.get('descend_m'))
                window['-NOTE-'].update(workout.get('message'))
            except (IndexError, KeyError) as err:
                print(err)
        elif event == '-DATA-+DBL+' or event == '-DATA-+ENTER+':
            try:
                #in case of double click or ENTER press on specific line - pop up detailed window
                workout = dd[values['-DATA-'][0]]  # selected workout
                #prepare layout for detailed window
                #define sizes of the details window TODO: bind to desktop size
                win2_width = 1100
                win2_height = 100
                WIN2_HEIGHT_MAX = 700

                windetails = [
                    [
                        FieldColumn("Sport: ", '-SPORT-',
                                    workout.get('sport')),
                        FieldColumn("Date: ", '-DATE-', workout.get('date')),
                        FieldColumn("Time: ", '-STARTTIME-',
                                    workout.get('time')),
                        FieldColumn("Duration: ", '-DURATION-',
                                    workout.get('duration')),
                        FieldColumn("Distance: ", '-DISTANCE-',
                                    workout.get('distance'))
                    ],
                    [
                        FieldColumn("Pace: ", '-PACE-', workout.get('pace')),
                        FieldColumn("Ascent: ", '-ASC-',
                                    workout.get('ascend_m')),
                        FieldColumn("Descent: ", '-DESC-',
                                    workout.get('descend_m')),
                        FieldColumn("Alt min: ", '-ALTMIN-',
                                    workout.get('altitude_min_m')),
                        FieldColumn("Alt max: ", '-ALTMAX-',
                                    workout.get('altitude_max_m'))
                    ],
                    [
                        FieldColumn("HR AVG: ", '-HAVG-',
                                    workout.get('heart_rate_avg_bpm')),
                        FieldColumn("HR MAX: ", '-HMAX-',
                                    workout.get('heart_rate_max_bpm')),
                        FieldColumn("Calories: ", '-CAL-',
                                    workout.get('calories_kcal')),
                        FieldColumn("Cad AVG: ", '-CADAVG-',
                                    workout.get('cadence_avg_rpm')),
                        FieldColumn("Cad MAX: ", '-CADMAX-',
                                    workout.get('cadence_max_rpm'))
                    ],
                    [
                        FieldColumn("Speed AVG: ", '-SPAVG-',
                                    workout.get('speed')),
                        FieldColumn("Speed MAX: ", '-SPMAX-',
                                    workout.get('speed_max')),
                    ]
                ]
                msg = workout.get('message')
                lennote = 0 if msg is None else len(
                    msg)  #find out length of text note
                if lennote > 0:  # if there is note in workout - add text field and fill it with note
                    nlines = 0
                    for oneline in msg.split("\n"):
                        # text breaks at about 165 chars in average
                        nlines += int(len(oneline) / 165) + 1
                    nheight = int(lennote / 150) + 1
                    nlines = max(nlines, nheight)
                    windetails += [[
                        sg.Frame('Note', [[
                            sg.Text(msg,
                                    key='-NOTE-',
                                    size=(int(win2_width / 8), nlines))
                        ]])
                    ]]
                    win2_height += nlines * 8 + 50  #extend height of the window

                #check if there are pictures posted to the workout and add layout to the window
                pict = workout.get('pictures')
                if pict is not None:
                    linewidth = 0
                    imgline = []
                    for i in range(0, len(pict)):
                        # The image bytes get their own name: reusing
                        # ``data`` here would overwrite the table
                        # placeholder that is later passed to updatetable().
                        try:
                            url = pict[i][1].get('picture')[0][0].get('url')
                            img_data, (imgwidth, imgheight) = get_img_data(
                                folder_path + '/' + url, first=True)
                        except KeyError:
                            url = pict[i].get('picture_file')
                            img_data, (imgwidth, imgheight) = get_img_data(
                                os.path.join(folder_path, 'Images',
                                             os.path.split(url)[1]),
                                first=True)
                        # Start a new row when this image would not fit.
                        if linewidth + imgwidth > win2_width:
                            windetails += [imgline]
                            win2_height += imgheight + 50
                            imgline = []
                            linewidth = 0
                        imgline.append(
                            sg.Image(key='-IMAGE' + str(i) + '-',
                                     data=img_data))
                        linewidth += imgwidth
                    if imgline:
                        windetails += [imgline]
                        win2_height += imgheight + 50

                #create comments section
                comm_num = workout.get('num_comments')
                if comm_num != '':
                    emoji_re = emoji.get_emoji_regexp()
                    for entry in _workout_comments(workout):
                        comtext = entry['text']
                        nlines = 0
                        for oneline in comtext.split("\n"):
                            # comment text is assumed to wrap at about 100 chars
                            nlines += int(len(oneline) / 100) + 1
                        comh = nlines  #height of the comment cell to fit the comment
                        frame_layout = [[
                            sg.Text(emoji_re.sub(r'', entry['from']['name'])
                                    + ':',
                                    size=(20, comh)),
                            sg.Text(emoji_re.sub(r'', comtext),
                                    size=(100, comh),
                                    pad=(0, 0))
                        ]]
                        windetails += frame_layout
                        win2_height += 28  #TODO: add height depending on comment height

                win2_height = min(win2_height, WIN2_HEIGHT_MAX)

                win2layout = [[
                    sg.Column(windetails,
                              scrollable=True,
                              vertical_scroll_only=True,
                              size=(win2_width, win2_height))
                ]]
                win2 = sg.Window('Workout detail',
                                 win2layout,
                                 finalize=True,
                                 modal=True)
                win2.bind('<Escape>', '+ESC+')
                win2.bind('<Return>', '+ENTER+')

                while True:  # Event Loop
                    ev2, val2 = win2.read(timeout=100)  #timeout for debugger
                    if ev2 == sg.TIMEOUT_KEY:
                        continue
                    if ev2 == sg.WIN_CLOSED or ev2 == '+ESC+' or ev2 == '+ENTER+':
                        break
                win2.close()
                del win2layout
                del win2
                del windetails
            except (IndexError, KeyError) as err:
                print(err)

    window.close()
示例#18
0
 def _del_emoji(self):
     """Strip emoji from every entry of ``self.url_content`` in place.

     Returns ``self`` so calls can be chained.
     """
     for position, entry in enumerate(self.url_content):
         cleaned = emoji.get_emoji_regexp().sub(u'', entry)
         self.url_content[position] = cleaned
     return self
示例#19
0
def clear_emojis(target: str) -> str:
    """Return ``target`` with every emoji-pattern match removed."""
    emoji_pattern = get_emoji_regexp()
    stripped = emoji_pattern.sub(u'', target)
    return stripped
示例#20
0
def fill_word_stats(db, u, criteria):
    """Compute word, hashtag and URL statistics for a user and store them on ``u``.

    NOTE: this is Python 2 code (print statements, ``dict.iteritems``).

    Args:
        db: handle forwarded to ``get_user_tweets`` and ``unshort_url``.
        u: mapping describing the user; ``u['screen_name']`` and ``u['id']``
            are read, and every computed statistic is written back as a key.
        criteria: forwarded unchanged to ``get_user_tweets``.

    Returns:
        None. There is no return statement; results are stored in ``u``.
    """
    tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
    own_tweets = []
    rturlcnt = Counter()
    urlcnt = Counter()
    urlpertw = []
    tagcnt = Counter()
    tagpertw = []
    rttags = []
    words = []
    url_to_name = []
    uname = u['screen_name'].lower()
    # First pass: split tweets into retweets and own tweets and collect
    # URL hosts, hashtags and raw text.
    for t in get_user_tweets(db, u['id'], criteria, batch=1000):
        if 'retweeted_status' in t:
            # Retweets only contribute URL hosts and hashtags of the
            # retweeted status; their text is not tokenized.
            if 'urls' in t['retweeted_status'] and t['retweeted_status'][
                    'urls'] is not None:
                rturlcnt += Counter(
                    urlparse(unshort_url(db, i)).netloc
                    for i in t['retweeted_status'].get('urls', []))
            if 'hashtags' in t['retweeted_status']:
                # Appended as one entry per retweet; flattened further below.
                rttags.append(t['retweeted_status']['hashtags'])
        else:
            if 'urls' in t and t['urls'] is not None:
                urlcnt += Counter(
                    urlparse(unshort_url(db, i)).netloc for i in t['urls'])
                # Edit distance between each URL host and the lower-cased
                # screen name.
                url_to_name.extend(
                    edit_distance(urlparse(unshort_url(db, i)).netloc, uname)
                    for i in t['urls'])
                urlpertw.append(len(t['urls']))
            else:
                urlpertw.append(0)
            if 'hashtags' in t:
                tagcnt += Counter(t['hashtags'])
                tagpertw.append(len(t['hashtags']))
            else:
                tagpertw.append(0)
            if 'text' in t:
                own_tweets.append({'text': t['text']})

    if verbose(): print " tokenize"
    # One token list per own tweet, and the token count of each.
    words = [tknzr.tokenize(s) for s in itertext(iter(own_tweets))]
    wcounts = [len(s) for s in words]

    #tagfreq = Counter(t for s in tags for t in s)
    #tagpertw = [len(s) for s in tags]
    uniqtags = len(tagcnt)
    totaltags = sum(tagcnt.values())

    rttagfreq = Counter(t for s in rttags for t in s)
    uniqrtags = len(rttagfreq)
    totalrtags = sum(rttagfreq.values())

    if verbose(): print " wc"
    # Defaults used when the statistics block below fails part-way through.
    artcnt = 0
    proncnt = 0
    explcnt = 0
    loccnt = 0
    emocnt = 0
    emojicnt = 0
    tw = 0
    tuw = 0
    tu2w = 0
    t2w = 0
    ustat = [0] * 10
    wc = Counter()
    bifreq = Counter()
    capstweets = 0

    try:
        if verbose(): print "  words"
        # A tweet counts as all-caps when every one of its tokens is upper case.
        capstweets = sum(1 if all(w.isupper() for w in s) else 0
                         for s in words)
        wc = Counter(w for s in words for w in s)
        bigrams = (get_bigrams(s) for s in words)
        twstat = (get_phrase_stats(s) for s in words)
        # Element-wise sum of the per-tweet stat tuples.
        # NOTE(review): reduce() without an initial value raises on an empty
        # sequence; the bare except below would then keep the defaults.
        ustat = reduce(lambda x, y: tuple(map(operator.add, x, y)), twstat)
        if verbose(): print "  bigrams"
        bc = Counter(b for s in bigrams for b in s)
        if verbose(): print "  dicts"
        # Totals are taken here, before any counter entries are zeroed below.
        tuw = len(wc)
        tw = sum(wc.values())
        tu2w = len(bc)
        t2w = sum(bc.values())
        if verbose(): print "  freqs"
        bifreq = bc
        if verbose(): print "  pos"
        # Classify every distinct token; only existing keys are reassigned,
        # so the counter's size does not change during iteration.
        for w, i in wc.iteritems():
            wd = deaccent(w.lower())
            if wd in expletives: explcnt += i
            if wd in articles: artcnt += i
            if wd in pronouns: proncnt += i
            if is_location(wd): loccnt += i
            if wd in emoticons: emocnt += i
            # match() anchors at the start: tokens that begin with an emoji.
            if emoji.get_emoji_regexp().match(wd):
                emojicnt += i
                wc[w] = 0
            if wd in stopwords: wc[w] = 0  #do not count stopwords
            if wd in punctuation_chars: wc[w] = 0  #do not count punctuation
    except:
        # Best effort: any failure above is swallowed silently and whatever
        # values were assigned so far are used.
        pass

    seen_own = len(own_tweets)
    if seen_own == 0: seen_own = 1  #for division
    if verbose(): print " saving"
    u['total_words'] = tw
    if tw == 0: tw = 1  # avoid divzero
    u['min_wptw'] = min(wcounts) if len(wcounts) else 0
    u['avg_wptw'] = numpy.mean(wcounts) if len(wcounts) else 0
    u['med_wptw'] = numpy.median(wcounts) if len(wcounts) else 0
    u['std_wptw'] = numpy.std(wcounts) if len(wcounts) else 0
    u['unique_words'] = tuw
    u['lex_freq'] = 1.0 * tuw / tw
    u['total_bigrams'] = t2w
    if t2w == 0: t2w = 1  # avoid divzero
    u['unique_bigrams'] = tu2w
    u['bigram_lex_freq'] = 1.0 * tu2w / t2w
    u['articles'] = artcnt
    u['pronouns'] = proncnt
    u['expletives'] = explcnt
    u['locations'] = loccnt
    u['emoticons'] = emocnt
    u['emoji'] = emojicnt
    # NOTE(review): the meaning of each ustat slot is defined by
    # get_phrase_stats, which is not visible here; the key names below are
    # the only evidence of what each index holds.
    u['alltokens'] = ustat[0]
    u['all_caps_words'] = ustat[1]
    u['all_caps_words_pcnt'] = 100.0 * ustat[1] / tw
    u['all_caps_tweets'] = capstweets
    u['all_caps_tweets_pcnt'] = 100.0 * capstweets / seen_own
    u['all_nocaps_words'] = ustat[2]
    u['all_nocaps_words_pcnt'] = 100.0 * ustat[2] / tw
    u['punctuation_chars'] = ustat[3]
    u['total_chars'] = ustat[8]
    # Character percentages are relative to ustat[8], clamped to at least 1.
    u['punctuation_pcnt'] = 100.0 * ustat[3] / max(ustat[8], 1)
    u['digit_chars'] = ustat[4]
    u['digit_pcnt'] = 100.0 * ustat[4] / max(ustat[8], 1)
    u['alpha_chars'] = ustat[5]
    u['alpha_pcnt'] = 100.0 * ustat[5] / max(ustat[8], 1)
    u['upper_chars'] = ustat[6]
    u['upper_pcnt'] = 100.0 * ustat[6] / max(ustat[8], 1)
    u['lower_chars'] = ustat[7]
    u['lower_pcnt'] = 100.0 * ustat[7] / max(ustat[8], 1)
    u['greek_chars'] = ustat[9]
    u['greek_pcnt'] = 100.0 * ustat[9] / max(ustat[8], 1)
    u['total_hashtags'] = totaltags
    u['hashtags_per_tw'] = {
        'min': min(tagpertw) if len(tagpertw) else None,
        'max': max(tagpertw) if len(tagpertw) else None,
        'avg': numpy.mean(tagpertw) if len(tagpertw) else None,
        'med': numpy.median(tagpertw) if len(tagpertw) else None,
        'std': numpy.std(tagpertw) if len(tagpertw) else None
    }
    u['uniq_hashtags'] = uniqtags
    u['total_rt_hashtags'] = totalrtags
    u['uniq_rt_hashtags'] = uniqrtags
    # Entries zeroed in the loop above stay in wc with count 0, so they can
    # still appear here when there are fewer than 500 non-zero words.
    u['most_common_words'] = [{
        'word': i[0],
        'count': i[1]
    } for i in wc.most_common(500)]
    u['most_common_bigrams'] = [{
        'bigram': ' '.join(i[0]),
        'count': i[1]
    } for i in bifreq.most_common(500)]
    u['most_common_hashtags'] = [{
        'hashtag': i[0],
        'count': i[1]
    } for i in tagcnt.most_common(500)]
    u['most_common_rt_hashtags'] = [{
        'hashtag': i[0],
        'count': i[1]
    } for i in rttagfreq.most_common(500)]
    u['most_common_urls'] = [{
        'url': i[0],
        'count': i[1]
    } for i in urlcnt.most_common(500)]
    u['most_common_rt_urls'] = [{
        'url': i[0],
        'count': i[1]
    } for i in rturlcnt.most_common(500)]
    u['seen_urls'] = sum(urlcnt.values())
    u['urls_per_tw'] = {
        'min': min(urlpertw) if len(urlpertw) else None,
        'max': max(urlpertw) if len(urlpertw) else None,
        'avg': numpy.mean(urlpertw) if len(urlpertw) else None,
        'med': numpy.median(urlpertw) if len(urlpertw) else None,
        'std': numpy.std(urlpertw) if len(urlpertw) else None
    }
    u['avg_edit_distance'] = numpy.mean(url_to_name) if len(
        url_to_name) else None
示例#21
0
async def promote_usr(client, message):
    """Promote the replied-to user to admin and set a custom title.

    Outside group/supergroup chats the command message is just deleted.
    The title is the command arguments with emoji matches removed, cut to
    15 characters. Failures are reported back through ``msg``.
    """

    async def _notify_and_cleanup(text):
        # Show a short notice, leave it visible for 5 seconds, then remove
        # the command message.
        await msg(message, text=text)
        await asyncio.sleep(5)
        await message.delete()

    if message.chat.type not in ["group", "supergroup"]:
        await message.delete()
        return

    args = message.command
    custom_rank = ""
    chat_id = message.chat.id
    chat = await client.get_chat(chat_id)
    allowed = await is_sudoadmin(message)

    if not allowed:
        await _notify_and_cleanup("`permission denied`")
        return

    if not message.reply_to_message:
        await _notify_and_cleanup("`reply to a user to promote`")
        return

    member = await client.get_chat_member(
        chat_id, message.reply_to_message.from_user.id)
    user_id = message.reply_to_message.from_user.id
    # Slicing is a no-op for titles that are already 15 characters or fewer.
    custom_rank = get_emoji_regexp().sub("", " ".join(args[1:]))[:15]

    if not user_id:
        return

    try:
        await client.promote_chat_member(
            chat_id,
            user_id,
            can_change_info=True,
            can_delete_messages=True,
            can_restrict_members=True,
            can_invite_users=True,
            can_pin_messages=True,
        )

        await asyncio.sleep(2)
        await client.set_administrator_title(chat_id, user_id, custom_rank)
        report = "**Promoted**\n"
        report += f"User: [{member.user.first_name}](tg://user?id={member.user.id})\n"
        report += f"(`{member.user.id}`)\n"
        report += f"Chat: `{chat.title}` (`{chat_id}`)"
        await msg(message, text=report)
    except UsernameInvalid:
        await _notify_and_cleanup("`invalid username`")
    except PeerIdInvalid:
        await _notify_and_cleanup("`invalid username or userid`")
    except UserIdInvalid:
        await _notify_and_cleanup("`invalid userid`")
    except ChatAdminRequired:
        await _notify_and_cleanup("`permission denied`")
    except Exception as e:
        # Unknown failure: report it, but keep the command message.
        await msg(message, text=f"**Log:** `{e}`")
示例#22
0
def remove_emoji(text):
    """Return ``text`` with every emoji regexp match deleted."""
    emoji_pattern = emoji.get_emoji_regexp()
    return emoji_pattern.sub(u'', text)
示例#23
0
def strip_emoji(text):
    """Return ``text`` with every emoji regexp match replaced by one space."""
    return emoji.get_emoji_regexp().sub(r" ", text)
示例#24
0
def deEmojify(inputString):
    """Return ``inputString`` with every emoji regexp match removed."""
    emoji_pattern = get_emoji_regexp()
    return emoji_pattern.sub("", inputString)
示例#25
0
async def promote_usr(client, message):
    """Promote a user to admin and set a custom title.

    The target is the replied-to user, or otherwise the user named by the
    first command argument. The remaining arguments, with emoji matches
    removed, become the title. Outside group/supergroup chats the command
    message is just deleted.
    """

    async def _report_and_cleanup(key):
        # Show the localized notice for 5 seconds, then remove the command.
        await edit_or_reply(message, text=tld(key))
        await asyncio.sleep(5)
        await message.delete()

    if message.chat.type not in ['group', 'supergroup']:
        await message.delete()
        return

    args = message.command
    allowed = await admin_check(message)
    if not allowed:
        await _report_and_cleanup('denied_permission')
        return

    try:
        if message.reply_to_message:
            user_id = message.reply_to_message.from_user.id
            custom_rank = get_emoji_regexp().sub('', ' '.join(args[1:]))
        else:
            # args[1] raises IndexError when no target was given.
            target = await client.get_users(args[1])
            custom_rank = get_emoji_regexp().sub('', ' '.join(args[2:]))
            user_id = target.id
    except IndexError:
        await message.delete()
        return

    if not user_id:
        return

    try:
        await client.promote_chat_member(
            message.chat.id,
            user_id,
            can_change_info=True,
            can_delete_messages=True,
            can_restrict_members=True,
            can_invite_users=True,
            can_pin_messages=True,
        )

        await asyncio.sleep(2)
        await client.set_administrator_title(
            message.chat.id, user_id, custom_rank,
        )
        await message.delete()
    except UsernameInvalid:
        await _report_and_cleanup('user_invalid')
    except PeerIdInvalid:
        await _report_and_cleanup('peer_invalid')
    except UserIdInvalid:
        await _report_and_cleanup('id_invalid')
    except ChatAdminRequired:
        await _report_and_cleanup('denied_permission')
示例#26
0
def give_emoji_free_text(text):
    """Decode ``text`` as UTF-8 and replace each emoji match with a space."""
    emoji_pattern = emoji.get_emoji_regexp()
    return emoji_pattern.sub(r' ', text.decode('utf8'))
# Module logger: DEBUG and above, sent to a stream handler with a
# "[LEVEL] time: message" layout.
l = logging.getLogger(__name__)
l.setLevel(logging.DEBUG)

ls = logging.StreamHandler()
ls.setLevel(logging.DEBUG)
formatter = logging.Formatter('[%(levelname)s] %(asctime)s: %(message)s')
ls.setFormatter(formatter)
l.addHandler(ls)

# Shared module state, all initialized at import time.
retrieved_tweets_count = 0
failed_tweets_count = 0
start_time = datetime.now()
# Items taken from this queue by process_tweets() are indexed with ['text'].
queue = Queue()
threads = []
# Fetched once here and reused by process_tweets().
emoji_regexp = emoji.get_emoji_regexp()
# Loop flag read by process_tweets().
work = True
# Output file opened in append mode; no close() is visible in this section.
store = open(DOWNLOADED_TWEETS_PATH, 'a')

class UnknownTwitterEmojiException(Exception):
    """Plain ``Exception`` subclass with no extra behaviour.

    NOTE(review): presumably raised for an emoji with no known mapping;
    no raise site is visible in this section -- confirm.
    """

def process_tweets():
    """Take tweets from ``queue`` and append one line per tweet to ``store``.

    Runs while the module-level ``work`` flag is truthy. Newlines in the
    tweet text become spaces, and each emoji match is replaced by its entry
    in ``emoji.unicode_codes.UNICODE_EMOJI``.
    """
    while work:
        item = queue.get()
        line = item['text'].replace('\n', ' ')
        # Matches are collected once, before any replacement rewrites the text.
        for found in emoji_regexp.findall(line):
            replacement = emoji.unicode_codes.UNICODE_EMOJI[found]
            line = line.replace(found, replacement)

        store.write('{}\n'.format(line))
        # Flush after every tweet so the file is current after each write.
        store.flush()
示例#28
0
def give_emoji_free_text(text):
    """Return ``text`` with every emoji regexp match deleted."""
    emoji_pattern = emoji.get_emoji_regexp()
    return emoji_pattern.sub(r'', text)
示例#29
0
def simpleTokenize(text):
    """Tokenize ``text`` while keeping ``Protected`` matches as single tokens.

    The text is passed through ``splitEdgePunct``, split on the emoji regexp
    and re-joined with spaces. Spans matched by ``Protected`` are kept whole;
    everything between them is split on single spaces. The result is built
    with ``addAllnonempty``.
    """

    # Do the no-brainers first
    spaced = ' '.join(emoji.get_emoji_regexp().split(splitEdgePunct(text)))

    # Non-empty protected spans (e.g. URLs, 1.0, U.N.K.L.E., 12:53) must
    # not be split.
    spans = [
        found.span()
        for found in Protected.finditer(spaced)
        if found.start() != found.end()
    ]
    protected = [[spaced[begin:end]] for begin, end in spans]

    # Turn spans like [(2, 5), (8, 10)] into boundaries [0, 2, 5, 8, 10, len]
    # so consecutive pairs delimit the splittable stretches.
    boundaries = [0]
    for begin, end in spans:
        boundaries.extend((begin, end))
    boundaries.append(len(spaced))

    splittable = [
        spaced[boundaries[k]:boundaries[k + 1]].strip().split(' ')
        for k in range(0, len(boundaries), 2)
    ]

    # Interleave splittable and protected pieces; there is always one more
    # splittable stretch than protected spans, appended last.
    tokens = []
    for free_part, kept_part in zip(splittable, protected):
        tokens = addAllnonempty(tokens, free_part)
        tokens = addAllnonempty(tokens, kept_part)
    tokens = addAllnonempty(tokens, splittable[len(protected)])

    # BTO: our POS tagger wants "ur" and "you're" to both be one token.
    return tokens
示例#30
0
def deEmojify(inputString):
    """Return ``inputString`` with every emoji regexp match removed."""
    emoji_pattern = get_emoji_regexp()
    cleaned = emoji_pattern.sub(u'', inputString)
    return cleaned
示例#31
0
def remove_emoji(inputString):
    """Replace emoji matches with spaces, then collapse whitespace runs.

    Each emoji regexp match becomes one space; any run of two or more
    whitespace characters is then reduced to a single space.
    """
    without_emoji = emoji.get_emoji_regexp().sub(u' ', inputString)
    # Raw string: "\s" in a normal literal is an invalid escape sequence.
    return re.sub(r"\s\s+", " ", without_emoji)
    ("g*d", "god"),
    ("s*x", "sex"),
    ("a*s", "ass"),
    ("a**hole", "asshole"),
    ("a***ole", "asshole"),
    ("a**", "ass"),
]

# Compiled (pattern, replacement) pairs built from WORDS_REPLACER.
REGEX_REPLACER = []
for origin, new in WORDS_REPLACER:
    # Escape the masking asterisks so they match literally. Only "*" is
    # escaped; any other regex metacharacter in `origin` keeps its meaning.
    # Raw string: "\*" in a normal literal is an invalid escape sequence.
    o1 = origin.replace("*", r"\*")
    REGEX_REPLACER.append((re.compile(o1), new))
RE_SPACE = re.compile(r"\s")
RE_MULTI_SPACE = re.compile(r"\s+")

EMOJI_REGEXP = emoji.get_emoji_regexp()
# Maps each emoji to " EMJ <alias words> ", with the alias colons stripped
# and underscores turned into spaces.
UNICODE_EMOJI_MY = {}
for k, v in emoji.UNICODE_EMOJI_ALIAS.items():
    v = v.strip(':')
    v = v.replace('_', ' ')
    UNICODE_EMOJI_MY[k] = f" EMJ {v} "


def replace(match):
    """Return the ``UNICODE_EMOJI_MY`` entry for the matched text, or None."""
    matched_text = match.group(0)
    return UNICODE_EMOJI_MY.get(matched_text)


def my_demojize(string):
    """Substitute emoji via ``replace`` and drop U+FE0F characters."""
    substituted = EMOJI_REGEXP.sub(replace, string)
    # Remove any U+FE0F characters left after the substitution.
    return re.sub("\ufe0f", "", substituted)

示例#33
0
def _calc_emoji_offset(to_calc):
    """Sum, over all emoji matches, the UTF-16 code units beyond the first.

    Each match contributes ``len(utf-16-le bytes) // 2 - 1``.
    """
    extra_units = 0
    for found in emoji.get_emoji_regexp().finditer(to_calc):
        utf16_bytes = found.group(0).encode('utf-16-le')
        extra_units += len(utf16_bytes) // 2 - 1
    return extra_units