Example #1
  def remove_unwanted(self,output_df):
    comments = pd.read_json(output_df.to_json(orient = 'records'))
    demoji.download_codes()

    comments['clean_comments'] = comments['commentString'].apply(lambda x: demoji.replace(x,""))
    comments['language'] = 0

    comments['clean_parent_comments'] = comments['parentCommentString'].apply(lambda x: demoji.replace(x,""))
    comments['language'] = 0

    comments['clean_video_title'] = comments['videoTitle'].apply(lambda x: demoji.replace(x,""))
    comments['language'] = 0
    return comments
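
All of the snippets on this page lean on the same small demoji API: demoji.replace(text, repl) to substitute emoji and demoji.findall(text) to enumerate them, with demoji.download_codes() only needed on old pre-1.0 releases (newer versions bundle the emoji data). A minimal, self-contained sketch with made-up sample text:

import demoji

# demoji.download_codes() is only required on old demoji releases;
# recent versions ship the emoji data with the package.
text = "Great run today 🏃 new PB 🎉"

print(demoji.replace(text, ""))         # drop emoji entirely
print(demoji.replace(text, "[emoji]"))  # or substitute a placeholder
print(demoji.findall(text))             # dict of {emoji: description}
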
Example #2
def demojify_graph(graph):
    if isinstance(graph, pygal.Bar):

        if hasattr(graph, 'x_labels'):
            graph.x_labels = [demoji.replace(str(x)) for x in graph.x_labels]

        if hasattr(graph, 'y_labels'):
            graph.y_labels = [demoji.replace(str(y)) for y in graph.y_labels]

    if isinstance(graph, pygal.Pie):

        for i in range(len(graph.raw_series)):
            graph.raw_series[i][1]['title'] = demoji.replace(
                graph.raw_series[i][1]['title'])
Example #3
def clean_tweet(tweet):
    tok = English()
    # Remove "RT", usernames and asterisk-prefixed words
    tweet = re.sub(r"(RT|[@*])(\w*)", " ", tweet)
    # Remove Hashtag
    tweet = re.sub(r"#(\w+)", " ", tweet)
    # Remove links in tweets
    tweet = re.sub(r"http\S+", " ", tweet)
    # Strip newlines, '#', and common punctuation
    tweet = re.sub(
        r"(\\n)|(\#)|(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])",
        "",
        tweet,
    )
    tweet = re.sub(r"(<br\s*/><br\s*/>)|(\-)|(\/)", " ", tweet)
    # Keep only alphanumeric characters and convert to lower case
    tweet = re.sub(r"[^a-zA-Z0-9]", " ", tweet.lower())
    # Tweets are usually full of emojis. We need to remove them.
    tweet = demoji.replace(tweet, repl="")
    # Stop words add no meaning to tweets, so they can be removed
    tweet_words = tok(tweet)
    clean_tweets = []
    for word in tweet_words:
        if not word.is_stop and len(word) > 1:
            clean_tweets.append(word.text.strip())

    tweet = " ".join(clean_tweets)

    return tweet
Example #4
def clean_tweets(data):
    new_data = data.copy()

    username_hash = r'[#@]\w+'
    punctuation = '[%s]+' % re.escape(string.punctuation)
    special_char = r'[^0-9a-zA-Z\s]+'
    number = r'[0-9]+'
    space = r'\s{2,}'
    space_begin_end = r'^\s+|\s+$'
    url = r'(https?|www):\/{1,}\w+\W+\w+\/{1,}\w+'
    char_ref = r'&\w+;'

    for i in range(len(new_data)):
        new_data[i] = re.sub(char_ref, ' ', new_data[i])
        new_data[i] = re.sub(username_hash, '', new_data[i])
        new_data[i] = re.sub(url, '', new_data[i])
        new_data[i] = re.sub(punctuation, '', new_data[i])
        new_data[i] = re.sub(number, '', new_data[i])
        new_data[i] = re.sub(space_begin_end, '', new_data[i])
        new_data[i] = re.sub(space, ' ', new_data[i])  # collapse runs of whitespace to one space
        new_data[i] = demoji.replace(new_data[i], '')
        new_data[i] = re.sub(special_char, '', new_data[i])
        new_data[i] = new_data[i].replace('\n', ' ')
        new_data[i] = new_data[i].replace('\xa0', ' ')
        new_data[i] = new_data[i].strip().lower()

    return new_data
Example #5
    def is_many_reactions(self) -> bool:
        found_reactions = find_emojis_in_str(self.text)
        if any(is_disallowed_reaction(r) for r in found_reactions):
            return False

        return (REACTIONS_IN_SINGLE_MSG_LIMIT >= len(found_reactions) > 1
                and not demoji.replace(self.text).strip())
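
Examples #5 and #14 share the same idiom: a message counts as a pure emoji reaction when demoji.replace(...) leaves nothing but whitespace. A self-contained sketch of that check with plain demoji (the helper name is made up; find_emojis_in_str and the reaction limits are project-specific):

import demoji

def is_only_emojis(text: str) -> bool:
    # True when the text contains at least one emoji and stripping
    # all emoji leaves nothing but whitespace.
    return bool(demoji.findall(text)) and not demoji.replace(text).strip()

print(is_only_emojis("🔥🔥🔥"))   # True
print(is_only_emojis("nice 🔥"))  # False
print(is_only_emojis("   "))      # False (no emoji at all)
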
Example #6
def data_cleaner_tweets(column):
    pattern = re.compile(r'RT @[\w_]+: |https?:\/\/.*[\r\n]*|[^A-Za-z0-9]+')
    column = column.apply(lambda x: demoji.replace(str(x), ''))
    column = column.apply(lambda x: re.sub(pattern, ' ', str(x)))
    #column = column.apply(lambda x:re.sub('[^A-Za-z0-9]+',' ',x))
    return column
Example #7
def url_remove(text):
    # Strip Unicode emoji with demoji
    text = demoji.replace(text, '')
    # Strip custom (server) emoji with a regex
    text = re.sub(r'<:\w*:\d*>', '', text)
    # Escape markdown, then strip the escape characters and quote markers it leaves behind
    text = discord.utils.escape_markdown(text=text)
    text = text.replace('\*', '')
    text = text.replace('\_', '')
    text = text.replace('\~', '')
    text = text.replace('\|', '')
    text = text.replace('\`', '')
    text = text.replace('> ', '')
    # Escape mentions, then strip the escaped user/role mention tokens
    text = discord.utils.escape_mentions(text=text)
    text = re.sub(r'<@​!\d+>', '', text)
    text = re.sub(r'<@​&\d+>', '', text)
    # Strip channel mentions with a regex
    text = re.sub(r'<#\d+>', '', text)
    # Strip URLs
    if text.find('http') != -1:
        pattern = r"https?://[\w/:%#\$&\?\(\)~\.=\+\-]+"
        url_list = re.findall(pattern, text)
        for item in url_list:
            text = text.replace(item, '')
    return text
Example #8
def build_reply(request, reply_type):
    uploaded_link = request['uploaded_link']
    submission = reddit.submission(request['submission_id'])

    nsfw_warning = ''
    if submission.over_18:
        nsfw_warning = '**NSFW** '

    reply = f"###[{nsfw_warning}{config['DOWNLOAD_TEXT']}]({uploaded_link})"

    header = ''
    announcement = ''
    if request['type'] == 'comment' and reply_type == 'comment':
        announcement = config['ANNOUNCEMENT_COMMENT']
        header = config['HEADER']
    elif request['type'] == 'comment' and reply_type == 'message':
        announcement = config['ANNOUNCEMENT_PM']

    reply = header + reply + announcement

    # Footer
    footer = config['FOOTER']
    if request['sub'] in config['NO_FOOTER_SUBS']:
        footer = ""

    # Emojis
    if request['sub'] in config['NO_EMOJI_SUBS']:
        reply = demoji.replace(reply, "")

    return reply + reddit_tube_ad + footer
Example #9
def get_df_analysis(spotipyUserAuth, tracks_df, segments=True, min_conf=0.5,
                    min_dur=0.25, tempo=True, sections=False, beats=False, bars=False):
    '''
    spotipyUserAuth : Spotipy auth object.
    tracks_df : DataFrame of tracks with 'name', 'id' and 'artists_name' columns
    segments and tempo: Default True. False if not needed
    min_conf: minimum confidence to include a segment (range 0-1)
    min_dur : minimum duration/length in secs to include a segment
    sections/beats/bars: Default False. True if needs to be returned

    Returns : a dict with key/value pairs for all tracks in the playlist
                Keys: name of track
                Value: list containing tempo and segment dataframe of the track
                       (and sections/beats/bars if asked)
    '''

    tracks_name = list(tracks_df['name'])
    tracks_id = list(tracks_df['id'])
    artists_name = list(tracks_df['artists_name'])
    # track_analysis returns a list of dictionary
    tracks_analysis = sc.get_tracks_analysis(spotipyUserAuth, tracks_id)
    df_analysis = {}

    for name_, artists_name_, track_analysis in zip(tracks_name, artists_name, tracks_analysis):

        # remove any special characters from name (they may cause issues in filenaming)
        name_ = re.sub(r'[*|><:"?/]|\\', "", name_)
        name_ = demoji.replace(name_)
        artists_name_ = re.sub(r'[*|><:"?/]|\\', "", artists_name_)
        name_ = name_ + '-' + artists_name_[:3]
        df_analysis[name_] = sc.get_segments(track_analysis, segments=segments,
                                             min_conf=min_conf, min_dur=min_dur, tempo=tempo,
                                             sections=sections, beats=beats, bars=bars)
    return df_analysis
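
This function and get_folder_features further down sanitize names the same way before using them in file names: strip characters that are unsafe in paths, then strip emoji. A stand-alone sketch of just that step (the sample name is made up):

import re
import demoji

name = 'Track: "Remix" 🎧 A/B'
name = re.sub(r'[*|><:"?/]|\\', "", name)  # drop characters unsafe for file names
name = demoji.replace(name)                # drop emoji
print(name)  # 'Track Remix  AB'
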
Example #10
def clean_text(text):
    '''
    Removes mentions, non alphanumeric/twitter characters, and emojis
    '''
    t = text.lower()
    t = remove_mentions(t)
    t = re.sub(non_alphanumeric_or_twitter_characters," ",t)
    return demoji.replace(t)  # remove emojis
Example #11
def modify_name(name: str) -> str:
    if not demoji.last_downloaded_timestamp():
        demoji.download_codes()

    name = demoji.replace(string=name, repl="")
    name = unicodedata.normalize("NFKC", name)
    name = iuliia.translate(source=name, schema=iuliia.WIKIPEDIA)
    return name.title()
Example #12
def clean_tweets(tweet):
    tweet = tweet.lower()  # String to lower case
    tweet = re.sub("@[A-Za-z0-9]+", "", tweet)  # Remove mentions
    tweet = re.sub("http\S+", "", tweet)  # Remove links
    tweet = re.sub(r'[^\w\s]', "", tweet)  # Remove punctuation
    tweet = demoji.replace(tweet, "")  # Remove emojis
    tweet = tweet.replace("\n", " ")  # Remove new lines

    return tweet
Example #13
def beautify_tweet(row):
    tweet_text = demoji.replace(row["tweet_text"],"")
    tweet_text = re.sub(r"(?:\@|https?\://)\S+", "", tweet_text)
    tweet_text = re.sub(r"#(\w+)","",tweet_text)
    tweet_text = re.sub("RT|,|[\W\d!@#$%&*:\/]*…|\*|&amp;|&gt;|&lt;",
           "", tweet_text)
    tweet_text = re.sub("’|‘|”|“", "", tweet_text)
    tweet_text = re.sub("[-«\[\]\(\)\<\>\{\}»—\\\@\#\$\%\&\:\/]|[\.]+|[\!]+|[\?]+|\n", "", tweet_text)
    return tweet_text
Example #14
    def is_simple_emoji_or_textual_reaction(self) -> bool:
        if is_disallowed_reaction(self.text):
            return False
        if len(self.text) == 1 or self.text in TEXTUAL_REACTIONS:
            return True

        found_reactions = find_emojis_in_str(self.text)
        return len(found_reactions) == 1 and not demoji.replace(
            self.text).strip()
Example #15
    def de_emojify(self, text):
        """Removes emojis and some special characters from the text.

        :param text: Text that contains emoji
        """
        logger.info("Removing emojis...")
        regex_pattern = re.compile(pattern="[\u2069\u2066]+",
                                   flags=re.UNICODE)
        text = regex_pattern.sub('', text)
        return demoji.replace(text, " ")
Example #16
def processTweet(tweet):
    tweet = demoji.replace(tweet)  # removes emoji
    tweet = tweet.lower() #lowercase
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', '', tweet) #removes urls
    tweet = re.sub(r'@[^\s]+', '', tweet) #removes usernames
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)  # strips '#' but keeps the hashtag text
    tweet = ''.join(ch for ch in tweet if ch not in set(punctuation)) # removes punctuation
    tweet = ''.join(ch for ch in tweet if ch not in ['…'])  # removes ellipses (not included in punctuation)
    tweet = word_tokenize(tweet) # tokenize string into list
    return [word for word in tweet if word not in _stopwords] # removes english stopwords
Example #17
def preprocessing(i, file, userList):
    tweet_dic = json.loads(i)
    #date
    date = time.strftime(
        '%Y-%m-%dT%H:00:00Z',
        time.strptime(tweet_dic["created_at"], "%a %b %d %H:%M:%S +0000 %Y"))
    format_str = '%Y-%m-%dT%H:00:00Z'
    dt = datetime.strptime(date, format_str)
    final = dt + timedelta(hours=1)
    #reply_text, poi_id, poi_name
    tweet_dic['tweet_date'] = final.strftime(format_str)
    if ('full_text' in tweet_dic.keys()):
        print("tweet has full_text")
    elif ('text' in tweet_dic.keys()):
        print("has text instead of full_text")
        tweet_dic['full_text'] = tweet_dic['text']
    else:
        print("No full_text or text")
        return
    if tweet_dic['in_reply_to_status_id'] is not None:
        #print("wkefnwlm")
        if tweet_dic['in_reply_to_screen_name'] not in userList:
            tweet_dic['poi_name'] = tweet_dic['user']['screen_name']
            tweet_dic['poi_id'] = tweet_dic['user']['id']
        else:
            tweet_dic['poi_name'] = tweet_dic['in_reply_to_screen_name']
            tweet_dic['poi_id'] = tweet_dic['in_reply_to_user_id']
        tweet_dic['reply_text'] = tweet_dic['full_text']
    else:
        #print("welfnwelknfdwm")
        #print(tweet_dic)
        tweet_dic['poi_name'] = tweet_dic['user']['screen_name']
        tweet_dic['poi_id'] = tweet_dic['user']['id']
        tweet_dic['reply_text'] = None
    print(tweet_dic['poi_name'])
    #country
    screen_name = tweet_dic['poi_name']
    tweet_dic['country'] = "India"
    if screen_name in india_list:
        tweet_dic['country'] = "India"
    elif screen_name in usa_list:
        tweet_dic['country'] = 'USA'
    elif screen_name in brazil_list:
        tweet_dic['country'] = 'brazil'
    else:
        print("error poi {}".format(tweet_dic['id']))
    #text_xx
    #text_xx = "text_" + str(tweet_dic['lang'])
    full_text = tweet_dic['full_text']
    tweet_dic['text_copy'] = demoji.replace(full_text)
    tweet_dic['tweet_emotions'] = list(demoji.findall(full_text).keys())
    #time.sleep(1)
    json.dump(tweet_dic, file, ensure_ascii=False)
    #time.sleep(5)
    file.write("\n")
Example #18
def process_tweets(tweets, stemming=False, stemmer=None, banned_accounts=[]):
    processed_tweets = []
    for tweet in tweets:
        logging.info('Processing tweet: {}'.format(tweet['id']))
        if 'complete_text' in tweet:
            tweet_txt = tweet['complete_text']
            del tweet['complete_text']
        else:
            tweet_txt = tweet['text']
        if tweet['user']['screen_name'] in banned_accounts:
            continue
        # remove emojis, urls, mentions
        processed_txt = tw_preprocessor.clean(tweet_txt)
        processed_txt = demoji.replace(processed_txt).replace('\u200d️',
                                                              '').strip()
        processed_txt = emoji.get_emoji_regexp().sub(u'', processed_txt)
        tokens = [token.lower() for token in wordpunct_tokenize(processed_txt)]
        if tweet['lang'] == 'es':
            stop_words = stopwords.words('spanish')
            punct_signs = ['.', '[', ']', ',', ';', ')', '),', '(']
            stop_words.extend(punct_signs)
            words = [token for token in tokens if token not in stop_words]
            if stemming:
                stemmers = [stemmer.stem(word) for word in words]
                processed_txt = ' '.join([
                    stem for stem in stemmers
                    if stem.isalpha() and len(stem) > 1
                ])
            else:
                processed_txt = ' '.join(word for word in words)
        else:
            processed_txt = ' '.join([token for token in tokens])
        tweet['text'] = processed_txt
        if 'sentiment' in tweet:
            tweet['sentiment_polarity'] = tweet['sentiment']['score']
            del tweet['sentiment']
        tweet['hashtags'] = []
        for hashtag in tweet['entities']['hashtags']:
            tweet['hashtags'].append(hashtag['text'])
        #tweet['urls'] = []
        #for url in tweet['entities']['urls']:
        #    tweet['urls'].append(url['expanded_url'])
        tweet['mentions'] = []
        for mention in tweet['entities']['user_mentions']:
            tweet['mentions'].append(mention['screen_name'])
        del tweet['entities']
        tweet['url'] = f"http://www.twitter.com/{tweet['user']['screen_name']}/status/{tweet['id']}"
        del tweet['user']
        dt = datetime.strptime(tweet['created_at_date'], '%Y-%m-%d')
        tweet['month'] = dt.month
        tweet['year'] = dt.year
        tweet['week_month'] = f'{week_of_month(dt)}-{dt.month}'
        processed_tweets.append(tweet)
    return processed_tweets
Example #19
async def say(ctx, *, arg: str):
    filename = f"{uuid.uuid4().hex}.wav"
    pruned = re.sub('(<:.*:\d*>)', '', arg).strip()
    cleaned = demoji.replace(pruned, "")
    if setup['azure']['max_chars'] == 0 or len(
            cleaned) <= setup['azure']['max_chars']:
        await speak(ctx, cleaned, filename, True)
    else:
        await ctx.send(
            f"The message is longer than {setup['azure']['max_chars']} characters"
        )
Example #20
    def remove_emoji(self):
        # emoji_pattern = re.compile(
        #     u'(\U0001F1F2\U0001F1F4)|'  # Macau flag
        #     u'([\U0001F1E6-\U0001F1FF]{2})|'  # flags
        #     u'([\U0001F600-\U0001F64F])'  # emoticons
        #     "+", flags=re.UNICODE)
        # self.filename = emoji_pattern.sub('', self.filename)
        try:
            self.filename = demoji.replace(self.filename, '#')
        except IOError:
            # older demoji releases raise IOError until the emoji codes are downloaded
            demoji.download_codes()
            self.filename = demoji.replace(self.filename, '#')
Example #21
def filter_tweets_coronavirus(orig_tweet_text, nphrases_dict, country):
    orig_text = orig_tweet_text.strip().lower()
    orig_text = re.sub("&amp;|&gt;|&lt;"," ", orig_text)
    orig_text = orig_text.replace("\n", " ").replace("#","").replace("@", "").lower().strip()
    orig_text = demoji.replace(orig_text," ")
    for ph, ph_country in nphrases_dict.items():
        if country == ph_country or ph_country == "na":
            if find_word_in_str(orig_text, ph, True):
                return True
    return False
Example #22
def remove_emoji(data):
    updated_data = {}
    import demoji
    demoji.download_codes()
    for key in list(data.keys()):
        new_lines = []
        for line in data[key]:
            updated = demoji.replace(line, repl="").strip()
            if updated:
                new_lines.append(updated)
        updated_data[key] = new_lines
    return updated_data
Example #23
def assign_country_tweets(orig_tweet_text, tweet_text, nphrases, phrases_country, phrases_country_freq, phrases_hashtag, eu_countries, coded_countries):
    hashtags =  set()
    countries = defaultdict(int)
    text = tweet_text.lower().strip()
    final_country = set()

    orig_tweet_text = re.sub("&amp;|&gt;|&lt;"," ", orig_tweet_text)
    orig_text = orig_tweet_text.replace("\n", " ").replace("#","").replace("@", "").lower().strip()
    orig_text = demoji.replace(orig_text," ")
    for ph in nphrases:
        ph_pos = text.find(ph)
        orig_ph_pos = orig_text.find(ph)
        if  find_word_in_str(text, ph) or find_word_in_str(orig_text, ph, True):
            if phrases_hashtag.get(ph, None) is not None: 
                hashtags.add(phrases_hashtag[ph])

            #if phrases_country.get(ph, None) is not None:
            #    if phrases_country[ph] == "eu" or phrases_country[ph] not in only_en_countries:
            #            countries["eu"] += phrases_country_freq[ph]
            #    else:
            #        countries[phrases_country[ph]] += phrases_country_freq[ph]

    #for ph, ctry in coded_countries.items():
    #    ph_pos = text.find(ph)
    #    orig_ph_pos = orig_text.find(ph)
    #    if  find_word_in_str(text, ph) or find_word_in_str(orig_text, ph, True):
    #        final_country.add(ctry)

    if len(hashtags) == 0:
        hashtags = None
    #if len(countries) == 0 and len(final_country) == 0:
    #    final_country = "eu"
    #else:
    #    if len(final_country) > 0:
    #        if "usa" in final_country:
    #            final_country = "usa"
    #        elif "uk" in final_country:
    #            final_country = "uk"
    #        else:
    #            final_country = "eu"
    #    else:

    #        country_freq_max = 0

    #        if countries.get("usa", None) is not None:
    #            final_country = "usa"
    #        elif countries.get("uk", None) is not None:
    #            final_country = "uk"
    #        else:
    #            final_country = "eu"
    #return [final_country, hashtags]
    return hashtags
Example #24
    def translate_from_bel(self, text):
        text = demoji.replace(text, " ")
        translator = Translator()
        lan = translator.detect(text).lang
        if lan != 'be':
            return text
        result = translator.translate(text, src='be', dest='ru')
        translated_text = result.text
        for whole, first_digit, second_digit in time_fix_re.findall(
                translated_text):
            translated_text = translated_text.replace(
                whole, "%s:%s" % (first_digit, second_digit))
        return translated_text
Example #25
def expand_urls(urls):
    request_base = "https://b6dkeerw62x81o1j.pro.urlex.org/json/"

    request_payload = "***".join(urls)
    new_request_payload = demoji.replace(request_payload)
    request_url = request_base + new_request_payload

    try:
        with urllib.request.urlopen(request_url) as response:
            data = response.read()
            return data
    except Exception:  # on any failure, fall back to returning the constructed request URL
        return request_url
Example #26
    def obtain_data(self):
        if self.parser_type == 'electricity':
            url_template = 'https://www.epsdistribucija.rs/Dan_{{day}}_Iskljucenja.htm'
            for i in range(0, 4):
                url = url_template.replace('{{day}}', str(i))
                self.make_request(url)
                self.parse_data()
        else:
            bus_route_url = os.environ.get('BUS_ROUTE_URL')
            feed = feedparser.parse(bus_route_url)
            tags_to_skip = [
                'Aktivne izmene na linijama', 'Planirane izmene', 'Informacija'
            ]
            current_changes = []

            for post in feed.entries:
                for tag in post.tags:
                    if tag.term not in tags_to_skip:
                        bus_route_number = tag.term.split('a ')[1]
                        bus = Bus.where('bus_route_number', '=',
                                        bus_route_number).first()
                        content = self.get_data_for_bus(post.link)
                        current_live_route_change = demoji.replace(content)
                        if bus is None:
                            bus = Bus()
                            bus.bus_route_number = bus_route_number
                            bus.save()

                        data = {bus.id: []}
                        data[bus.id].append(current_live_route_change)
                        current_changes.append(data)

                for change in current_changes:
                    for bus_id in change:
                        changes_to_save = change[bus_id]
                        bus = Bus.find(bus_id)
                        saved_routes = bus.bus_route
                        for saved_route in saved_routes:
                            for change_to_save in changes_to_save:
                                if hashlib.blake2b(
                                        saved_route.route_change.encode(
                                            'utf-8')).hexdigest(
                                            ) == hashlib.blake2b(
                                                change_to_save.encode(
                                                    'utf-8')).hexdigest():
                                    changes_to_save.remove(change_to_save)

                        for change_to_save in changes_to_save:
                            self.update_bus_route(BusRoute(), bus,
                                                  change_to_save)
Example #27
def get_folder_features(spotipyUserAuth, filsort_pl=None, pl_name_id=None):
    '''
    Here, we will be using filtered and sorted output. Future edit should take user
    playlist names and id.

    spotipyUserAuth : Spotipy auth object.

    filsort_pl : Default None. Uses 4-tuple output from filtersort_playlist function.
    pl_name_id : Default None. In case filsort_pl is not available,
                 provide list of playlist name and id tuples

    Returns: a dict with key/value pairs for all playlists in the folder.
             Key : Name of the playlist (string)
             Value : pandas.DataFrame returned from get_playlist_features
    '''

    folder_features = {}

    if filsort_pl is not None:
        for p in filsort_pl:

            # remove any special characters from name (they may cause issues in filenaming)
            pl_name = re.sub(r'[*|><:"?/]|\\', "", p[1])
            pl_name = demoji.replace(pl_name)
            folder_features[pl_name] = get_playlist_features(spotipyUserAuth,
                                                             playlist_id=p[2])
    else:
        for p in pl_name_id:

            # remove any special characters from name (they may cause issues in filenaming)

            pl_name = re.sub(r'[*|><:"?/]|\\', "", p[0])
            pl_name = demoji.replace(pl_name)
            folder_features[pl_name] = get_playlist_features(spotipyUserAuth,
                                                             playlist_id=p[1])

    return folder_features
Example #28
def format_content_lite(content: str, users: dict = None, newlines: bool = True) -> str:
    """Format raw text content to recognizable HTML

    This is designated the lite function because some parts of Discord require special parsing rules.
    """

    # Encode multiline codeblocks (```text```)
    content = re.sub(r'```+((?:[^`]*?\n)?(?:[\s\S]+))\n?```+', _encode_codeblock, content)

    # Encode URLs
    content = re.sub(r'(\b(?:(?:https?|ftp|file)://|www\.|ftp\.)(?:\([-a-zA-Z0'
                     r'-9+&@#/%?=~_|!:,.\[\];]*\)|[-a-zA-Z0-9+&@#/%?=~_|!:,.'
                     r'\[\];])*(?:\([-a-zA-Z0-9+&@#/%?=~_|!:,.\[\];]*\)|[-a-z'
                     r'A-Z0-9+&@#/%=~_|$]))', _encode_url, content)

    # HTML-encode content
    content = html.escape(content)

    # Encode inline codeblocks (`text` or ``text``)
    content = re.sub(r'(``?)([^`]+)\1', _encode_inline_codeblock, content)

    # Encode mentions
    content = re.sub(r'((@everyone)|(@here)|(&lt;@!?(\d+)&gt;)|(&lt;@((.{2,32}?)#\d{4}) \((\d+)\)&gt;)|'
                     r'(@((.{2,32}?)#\d{4}))|(&lt;#\d+&gt;)|(&lt;#(.{1,100}?)&gt;)|(&lt;@&amp;(\d+)&gt;)|'
                     r'(&lt;@&amp;(.{1,100}?)&gt;))', _encode_mentions, content)

    # Encode escaped emojis
    content = re.sub(ESCAPED_EMOJI_PAT, _encode_emojis, content)

    # Process emojis (:text:)
    content = re.sub(EMOJI_REGEX, _process_emojis, content)

    # Process unicode emojis
    content = demoji.replace(content, lambda m: _process_unicode_emojis(m, 'emoji'))

    # Process bold (**text**)
    content = re.sub(r'\*\*((?:\\[\s\S]|[^\\])+?)\*\*(?!\*)', r'<b>\1</b>', content)

    # Process underline (__text__)
    content = re.sub(r'__((?:\\[\s\S]|[^\\])+?)__(?!_)', r'<u>\1</u>', content)

    # Process italic (*text* or _text_)
    content = re.sub(r'\b_((?:__|\\[\s\S]|[^\\_])+?)_\b|\*(?=\S)((?:\*\*|\\[\s\S]|\s+(?:\\[\s\S]|[^\s*\\]|\*\*)|'
                     r'[^\s*\\])+?)\*(?!\*)', r'<i>\1\2</i>', content)

    # Process strike through (~~text~~)
    content = re.sub(r'~~(?=\S)((?:\\[\s\S]|~(?!~)|[^\s\\~]|\s+(?!~~))+?)~~', r'<s>\1</s>', content)

    return _format_content(content, users, newlines)
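
Example #28 passes a callable rather than a string as the repl argument to demoji.replace; that works because demoji hands repl to the underlying regex substitution (re.sub), which accepts a function of the match object. A minimal sketch of the same trick (the span markup is only an illustration, not what _process_unicode_emojis emits):

import demoji

# repl may be a callable: it receives the re.Match for each emoji found.
html = demoji.replace("deploy done ✅🚀",
                      lambda m: f'<span class="emoji">{m.group()}</span>')
print(html)
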
Example #29
def download_submissions(subreddit, limit=1000):
    settings = {}
    with open('opts') as opts:
        for line in opts:
            name, setting = line.partition('=')[::2]
            settings[name.strip()] = setting.strip()

    reddit = praw.Reddit(client_id=settings['id'],
                         client_secret=settings['secret'],
                         user_agent=settings['user_agent'])

    d = {
        'author': [],
        'clicked': [],
        #'comments': [],
        'created_utc': [],
        'distinguished': [],
        'edited': [],
        'id': [],
        'is_original_content': [],
        'is_self': [],
        'link_flair_template_id': [],
        'link_flair_text': [],
        'locked': [],
        'name': [],
        'num_comments': [],
        'over_18': [],
        'permalink': [],
        #'poll_data': [],
        'score': [],
        'selftext': [],
        'spoiler': [],
        'stickied': [],
        #'subreddit': [],
        'title': [],
        'upvote_ratio': [],
        'url': []
    }
    for submission in reddit.subreddit(subreddit).top('all', limit=limit):
        for key in d:
            val = getattr(submission, key, 'None')
            if isinstance(val, str):
                val = demoji.replace(val)
                val = re.sub('[^A-Za-z0-9]+', '', val)
            d[key].append(val)

    df = pd.DataFrame(data=d)

    df.to_csv('{}_dump.csv'.format(subreddit.strip()))
Example #30
def preprocess(data):
    data = re.sub(r'(https?:\/\/\S+)','',str(data))
    data = convert_lower_case(data)
    data = remove_media_omit(data)
    data = remove_punctuation(data)  # remove comma separately
    data = remove_apostrophe(data)
    data = remove_stop_words(data)
    data = convert_numbers(data)
#    data = remove_stop_words(data) #needed again as num2word is giving stop words 101 - one hundred and one
    data = lemmatize(data)
#    data = remove_punctuation(data)
    data = demoji.replace(str(data).strip(),'')
    data = word_tokenize(str(data))
#    data = spacy_preprocess(data)
    return data