Example #1
 def __init__(self, **kwargs):
     self.tokens_to_normalize = kwargs.get("normalize", [])
     self.annotate = kwargs.get("annotate", [])
     self.unpack_hashtags = kwargs.get("unpack_hashtags", False)
     self.unpack_contractions = kwargs.get("unpack_contractions", False)
     self.segmenter_corpus = kwargs.get("segmenter", "english")
     self.corrector_corpus = kwargs.get("corrector", "english")
     self.segmenter = Segmenter(corpus=self.segmenter_corpus)
     self.spell_corrector = SpellCorrector(corpus=self.corrector_corpus)
     self.tokenizer = kwargs.get("tokenizer", None)
     self.simplify_emoticons = kwargs.get("simplify_emoticons", False)
     self.dictionaries = kwargs.get("dictionaries", [])
     self.stats = {}
     self.preprocessed_texts = -1
Example #2
def hashtag_sentiment(tweet):
    hash_tag = (re.findall("#([a-zA-Z0-9]{1,25})", tweet))
    seg = Segmenter()
    hashtag_polarity = []
    for hashtag in hash_tag:
        tokens = seg.segment(hashtag)
        ss = sid.polarity_scores(tokens)
        if 'not' not in tokens.split(' '):
            hashtag_polarity.append(ss['compound'])
        else:
            hashtag_polarity.append(-ss['compound'])
    sentiment = 0
    if len(hashtag_polarity) > 0:
        sentiment = round(
            float(sum(hashtag_polarity) / float(len(hashtag_polarity))), 2)
    return sentiment
Example #3
def hashtag_sentiment(tweet):
    hash_tag = (re.findall("#([a-zA-Z0-9]{1,25})", tweet))
    hashtag_polarity = []
    seg = Segmenter(corpus="twitter") 
    for hashtag in hash_tag:
        tokens = seg.segment(hashtag)
        ss = sid.polarity_scores(tokens)  # polarity_scores() of the SentimentIntensityAnalyzer
                                          # returns a sentiment dictionary containing
                                          # pos, neg, neu, and compound scores.
        if 'not' not in tokens.split(' '):
            hashtag_polarity.append(ss['compound'])
        else:
            hashtag_polarity.append(- ss['compound'])
    sentiment = 0
    if len(hashtag_polarity) > 0:
        sentiment = round(float(sum(hashtag_polarity) / float(len(hashtag_polarity))), 2)
    return sentiment
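
Both variants of hashtag_sentiment above assume a module-level sid (an NLTK VADER SentimentIntensityAnalyzer) as well as the re and ekphrasis Segmenter imports. A minimal, assumed setup for calling them could look like this (the sample tweet is only illustrative):

import re
from nltk.sentiment.vader import SentimentIntensityAnalyzer  # requires nltk.download('vader_lexicon')
from ekphrasis.classes.segmenter import Segmenter

sid = SentimentIntensityAnalyzer()

# average compound polarity over the segmented hashtags of a single tweet
print(hashtag_sentiment("Loving the new release #bestdayever #gamedev"))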
Example #4
def handle_tweets(df_tweets):
    seg_eng = Segmenter(corpus="english")
    texts = list(df_tweets["text"])
    #f = open(data_path + "abs_tweets.txt", "w")
    hashtags = []
    clean_tweets = []
    for t in texts:
        pattern = r'#\w+|#\w+$'
        remove = re.compile(pattern)
        removed_t = remove.sub(r'', t)
        matches = re.findall(pattern, t)
        hashes = [seg_eng.segment(i.lstrip('#').lower()) for i in matches]
        tweet = tokenizer(removed_t)
        clean_tweets.append(tweet)
        hashtags.append(hashes)
    #   f.write(tweet)
    #  f.write("\n")
    #f.close()
    return clean_tweets, hashtags
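
handle_tweets relies on a module-level tokenizer callable that is not shown in this snippet. A sketch of one possible setup, using ekphrasis' SocialTokenizer as a stand-in tokenizer (an assumption, not necessarily the original author's choice):

import re
import pandas as pd
from ekphrasis.classes.segmenter import Segmenter
from ekphrasis.classes.tokenizer import SocialTokenizer

# stand-in for the module-level tokenizer used by handle_tweets (assumption)
tokenizer = SocialTokenizer(lowercase=True).tokenize

df_tweets = pd.DataFrame({"text": ["Back to pixel art this weekend #retrogaming #gamedev"]})
clean_tweets, hashtags = handle_tweets(df_tweets)
print(clean_tweets, hashtags)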
Example #5
def extractHashtags(dataset):
    seg_tw = Segmenter(corpus="twitter")
    stop_words = set(stopwords.words('english'))
    # splitUpTweets is a helper (not shown here) applied to the extracted hashtags
    dataset['hashtags'] = dataset['text'].apply(lambda x: re.findall(
        r"#(\w+)", x)).apply(lambda x: splitUpTweets(x, seg_tw))

    # # Remove stop words in segmented tweet
    # for i in range(len(dataset['hashtags'])):
    #     if dataset['hashtags'][i] is not None:
    #         dataset['hashtags'][i] = list(filter(lambda a: ((a not in stop_words) & (a != "_")), dataset['hashtags'][i]))
    return dataset
Example #6
 def segmentation(self):
     from ekphrasis.classes.segmenter import Segmenter
     seg_eg = Segmenter(corpus="english")  # instantiated but not used below
     seg_tw = Segmenter(corpus="twitter")
     self.text = [seg_tw.segment(sent) for sent in self.text]
     return self.text
Example #7
import random
import re
from ekphrasis.classes.segmenter import Segmenter
listOfFILEcomments = []
listOfFILEposts = []
listOfFILEtags = []
# for i in range(0,17):
# 	listOfFILEcomments.append('allcomments' + str(i) + '.txt')
# 	listOfFILEposts.append('allposts' + str(i) + '.txt')
# 	listOfFILEtags.append('alltags' + str(i) + '.txt')

seg_eng = Segmenter(corpus="english")

listOfFILEcomments = ['allcomments.txt']
listOfFILEposts = ['allposts.txt']
listOfFILEtags = ['alltags.txt']

fhc = open('finalallcomments.txt', 'a+')
fhp = open('finalallposts.txt', 'a+')
fht = open('finalalltags.txt', 'a+')
for commentFILE, postFILE, tagFILE in zip(listOfFILEcomments, listOfFILEposts,
                                          listOfFILEtags):
    commentGenerator = open(commentFILE, 'r')
    postGenerator = open(postFILE, 'r')
    tagGenerator = open(tagFILE, 'r')
    for comment, post, tag in zip(commentGenerator, postGenerator,
                                  tagGenerator):
        if comment.strip() and post.strip() and tag.strip():
            fhc.write(comment)
            fhp.write(post)
            fht.write(';'.join(
Example #8
    def __init__(self, **kwargs):
        """
        Kwargs:
            omit (list): choose which tokens you want to omit from the text.
                possible values: ['email', 'percent', 'money', 'phone', 'user',
                    'time', 'url', 'date', 'hashtag']
                Important Notes:
                            1 - put url at front, if you plan to use it.
                                Messes with the regexes!
                            2 - if you use hashtag then unpack_hashtags will
                                automatically be set to False

            normalize (list): choose which tokens you want to normalize
                in the text.
                possible values: ['email', 'percent', 'money', 'phone', 'user',
                    'time', 'url', 'date', 'hashtag']
                for example: user@example.com will be transformed to <email>
                Important Notes:
                            1 - put url at front, if you plan to use it.
                                Messes with the regexes!
                            2 - if you use hashtag then unpack_hashtags will
                                automatically be set to False

            unpack_contractions (bool): Replace *English* contractions in
                ``text`` str with their unshortened forms
                for example: can't -> can not, wouldn't -> would not, and so on...

            unpack_hashtags (bool): split a hashtag into its constituent words.
                for example: #ilikedogs -> i like dogs

            annotate (list): add special tags to special tokens.
                possible values: ['hashtag', 'allcaps', 'elongated', 'repeated']
                for example: user@example.com -> user@example.com <email>

            tokenizer (callable): callable function that accepts a string and
                returns a list of strings. If no tokenizer is provided,
                the text will be tokenized on whitespace.

            segmenter (str): select which corpus statistics to use for
                word segmentation [english, twitter]

            corrector (str): select which corpus statistics to use for
                spell correction [english, twitter]

            all_caps_tag (str): how to wrap the capitalized words
                values [single, wrap, every]
                Note: applicable only when `allcaps` is included in annotate[]
                    - single: add a tag after the last capitalized word
                    - wrap: wrap all words with opening and closing tags
                    - every: add a tag after each word

            spell_correct_elong (bool): choose if you want to perform
                spell correction after the normalization of elongated words.
                * significantly affects performance (speed)

            spell_correction (bool): choose if you want to perform
                spell correction to the text
                * significantly affects performance (speed)

            fix_text (bool): choose if you want to fix bad unicode terms and
                html entities.
        """
        self.omit = kwargs.get("omit", {})
        self.backoff = kwargs.get("normalize", {})
        self.include_tags = kwargs.get("annotate", {})
        self.unpack_contractions = kwargs.get("unpack_contractions", False)
        self.tokenizer = kwargs.get("tokenizer", None)
        self.dicts = kwargs.get("dicts", None)
        self.spell_correction = kwargs.get("spell_correction", False)
        self.spell_correct_elong = kwargs.get("spell_correct_elong", False)
        self.fix_text = kwargs.get("fix_bad_unicode", False)
        self.unpack_hashtags = kwargs.get("unpack_hashtags", False)
        self.segmenter_corpus = kwargs.get("segmenter", "english")
        self.corrector_corpus = kwargs.get("corrector", "english")
        self.all_caps_tag = kwargs.get("all_caps_tag", "wrap")
        self.mode = kwargs.get("mode", "normal")

        if self.unpack_hashtags:
            self.segmenter = Segmenter(corpus=self.segmenter_corpus)
        if self.mode != "fast":
            self.spell_corrector = SpellCorrector(corpus=self.corrector_corpus)

        self.regexes = ExManager().get_compiled()
        if 'hashtag' in self.omit or 'hashtag' in self.backoff:
            print("You can't omit/backoff and unpack hashtags!\n "
                  "unpack_hashtags will be set to False")
            self.unpack_hashtags = False
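
The kwargs documented above are the standard ekphrasis TextPreProcessor options; a typical construction, following the library README and the configuration fragments in the later examples, looks roughly like this:

from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

text_processor = TextPreProcessor(
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user', 'time', 'date', 'number'],
    annotate={"hashtag", "allcaps", "elongated", "repeated", "emphasis", "censored"},
    fix_html=True,            # fix HTML tokens
    segmenter="twitter",      # corpus statistics used for hashtag segmentation
    corrector="twitter",      # corpus statistics used for spell correction
    unpack_hashtags=True,
    unpack_contractions=True,
    spell_correct_elong=False,
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    dicts=[emoticons])

print(" ".join(text_processor.pre_process_doc("CANT WAIT for the new season of #TwinPeaks :)")))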
Example #9
import pandas as pd
import re
from nltk.corpus import stopwords
import pickle
import ekphrasis
import numpy as np
from ekphrasis.classes.segmenter import Segmenter
from process_tweets import tokenizer, ner

stop_words = set()
stop_words.add("rt")
stop_words.add("RT")
seg_eng = Segmenter(corpus="english")


# distinguish inline from non-inline hashtags: inline hashtags are split into words, non-inline ones are deleted
def get_inline_notinline_htags(list_words):
    end_wt_punc = re.compile(r'#\w+[^a-zA-Z\d\s]')
    all_letters = re.compile(r'\w+$')
    all_letters_wt_punc = re.compile(r'\w+$|\w+[:,.]$')
    end = -1
    inline_ht, not_inline_ht = [], []
    for i in range(len(list_words)):
        if i <= end or not list_words[i].startswith("#"):
            continue
        end = i
        previous_word = None
        next_word = None
        if i > 0:
            previous_word = list_words[i - 1]
        for j in range(i + 1, len(list_words)):
Example #10
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
# import networkx as nx
import os
import pickle
from data_util.my_stopwords import *
from data_util.extract_key import extract_PF

from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
# from ekphrasis.classes.segmenter import Segmenter

from ekphrasis.classes.segmenter import Segmenter
# segmenter using word statistics from Twitter (the corpus can be "english" or "twitter")
seg_eng = Segmenter(corpus="twitter")  # english or twitter

from ekphrasis.classes.spellcorrect import SpellCorrector
sp = SpellCorrector(corpus="english") # english or twitter

alphbet_stopword = ['','b','c','d','e','f','g','h','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z','#']

# tokenization / stopword dictionaries
from nltk.corpus import stopwords as nltk_stopwords
nltk_stopwords = set(nltk_stopwords.words("english"))
stpwords_list3 = [f.replace("\n","") for f in open("data_util/stopwords.txt","r",encoding = "utf-8").readlines()]
stpwords_list3.remove("not")
stopwords = list(html_escape_table + stpwords_list2) + list(list(nltk_stopwords) + list(stpwords_list1) + list(stpwords_list3))
stopwords = stopwords + ["."] + alphbet_stopword
# stopwords = list(html_escape_table)  #+ list(stpwords_list1) + list(stpwords_list3)
print("斷詞辭典 已取得")
Example #11
class TextPreProcessor:
    def __init__(self, **kwargs):
        """
        Kwargs:
            omit (list): choose which tokens you want to omit from the text.
                possible values: ['email', 'percent', 'money', 'phone', 'user',
                    'time', 'url', 'date', 'hashtag']
                Important Notes:
                            1 - put url at front, if you plan to use it.
                                Messes with the regexes!
                            2 - if you use hashtag then unpack_hashtags will
                                automatically be set to False

            normalize (list): choose which tokens you want to normalize
                in the text.
                possible values: ['email', 'percent', 'money', 'phone', 'user',
                    'time', 'url', 'date', 'hashtag']
                for example: user@example.com will be transformed to <email>
                Important Notes:
                            1 - put url at front, if you plan to use it.
                                Messes with the regexes!
                            2 - if you use hashtag then unpack_hashtags will
                                automatically be set to False

            unpack_contractions (bool): Replace *English* contractions in
                ``text`` str with their unshortened forms
                for example: can't -> can not, wouldn't -> would not, and so on...

            unpack_hashtags (bool): split a hashtag into its constituent words.
                for example: #ilikedogs -> i like dogs

            annotate (list): add special tags to special tokens.
                possible values: ['hashtag', 'allcaps', 'elongated', 'repeated']
                for example: user@example.com -> user@example.com <email>

            tokenizer (callable): callable function that accepts a string and
                returns a list of strings. If no tokenizer is provided,
                the text will be tokenized on whitespace.

            segmenter (str): select which corpus statistics to use for
                word segmentation [english, twitter]

            corrector (str): select which corpus statistics to use for
                spell correction [english, twitter]

            all_caps_tag (str): how to wrap the capitalized words
                values [single, wrap, every]
                Note: applicable only when `allcaps` is included in annotate[]
                    - single: add a tag after the last capitalized word
                    - wrap: wrap all words with opening and closing tags
                    - every: add a tag after each word

            spell_correct_elong (bool): choose if you want to perform
                spell correction after the normalization of elongated words.
                * significantly affects performance (speed)

            spell_correction (bool): choose if you want to perform
                spell correction to the text
                * significantly affects performance (speed)

            fix_text (bool): choose if you want to fix bad unicode terms and
                html entities.
        """
        self.omit = kwargs.get("omit", {})
        self.backoff = kwargs.get("normalize", {})
        self.include_tags = kwargs.get("annotate", {})
        self.unpack_contractions = kwargs.get("unpack_contractions", False)
        self.tokenizer = kwargs.get("tokenizer", None)
        self.dicts = kwargs.get("dicts", None)
        self.spell_correction = kwargs.get("spell_correction", False)
        self.spell_correct_elong = kwargs.get("spell_correct_elong", False)
        self.fix_text = kwargs.get("fix_bad_unicode", False)
        self.unpack_hashtags = kwargs.get("unpack_hashtags", False)
        self.segmenter_corpus = kwargs.get("segmenter", "english")
        self.corrector_corpus = kwargs.get("corrector", "english")
        self.all_caps_tag = kwargs.get("all_caps_tag", "wrap")
        self.mode = kwargs.get("mode", "normal")

        if self.unpack_hashtags:
            self.segmenter = Segmenter(corpus=self.segmenter_corpus)
        if self.mode != "fast":
            self.spell_corrector = SpellCorrector(corpus=self.corrector_corpus)

        self.regexes = ExManager().get_compiled()
        if 'hashtag' in self.omit or 'hashtag' in self.backoff:
            print("You can't omit/backoff and unpack hashtags!\n "
                  "unpack_hashtags will be set to False")
            self.unpack_hashtags = False

    def __copy__(self):
        return self

    def __deepcopy__(self, memo):
        return self

    @staticmethod
    def add_special_tag(m, tag, mode="single"):

        if isinstance(m, str):
            text = m
        else:
            text = m.group()

        if mode == "single":
            return " {} <{}> ".format(text, tag)
        elif mode == "wrap":
            return " ".join([" <{}> {} </{}> ".format(tag, text, tag)]) + " "
        elif mode == "every":
            tokens = text.split()
            processed = " ".join([" {} <{}> ".format(t, tag) for t in tokens])
            return " " + processed + " "

    @lru_cache(maxsize=4096)
    def handle_hashtag_match(self, m):
        """
        Break a string into its constituent words (using the Viterbi algorithm)
        """
        text = m.group()[1:]

        # todo:simplify routine
        if text.islower():
            expanded = self.segmenter.segment(text)
            expanded = " ".join(expanded.split("-"))
            expanded = " ".join(expanded.split("_"))
            # print(m.group(), " - ", expanded)
            # with open("analysis/segmenter_" +
            # self.segmenter_corpus + ".txt", "a") as f:
            #     f.write(m.group() + "\t" + expanded + "\n")

        else:
            # split words following CamelCase convention
            expanded = self.regexes["camel_split"].sub(r' \1', text)
            expanded = expanded.replace("-", "")
            expanded = expanded.replace("_", "")
            # print(m.group(), " - ", expanded)

        if "hashtag" in self.include_tags:
            expanded = self.add_special_tag(expanded, "hashtag", mode="wrap")

        return expanded

    def handle_elongated_match(self, m):
        text = m.group()

        # normalize to at most 2 repeating chars
        text = self.regexes["normalize_elong"].sub(r'\1\1', text)

        normalized = self.spell_corrector.normalize_elongated(text)
        if normalized:
            text = normalized

        # try to spell correct the word
        if self.spell_correct_elong:
            text = self.spell_corrector.correct_word(text,
                                                     assume_wrong=True,
                                                     fast=True)
            # with open("analysis/spell_corrector_" +
            # self.corrector_corpus + ".txt", "a") as f:
            #     f.write(m.group() + " - " + text + "\n")

            # print(m.group(), "-", text)
        if "elongated" in self.include_tags:
            text = self.add_special_tag(text, "elongated")

        return text

    @lru_cache(maxsize=4096)
    def handle_repeated_puncts(self, m):
        """
        Return the sorted character set so that random combinations of
        punctuation marks are mapped to the same token, e.g.
        "!??!?!!", "?!!!!?!", "!!?", "!?!?" --> "?!"
        "!...", "...?!" --> ".!"
        :param m:
        :return:
        """
        text = m.group()
        text = "".join(sorted(set(text), reverse=True))

        if "repeated" in self.include_tags:
            text = self.add_special_tag(text, "repeated")

        return text

    @lru_cache(maxsize=4096)
    def handle_generic_match(self, m, tag, mode="every"):
        """

        Args:
            m ():
            tag ():
            mode ():

        Returns:

        """
        text = m.group()
        text = self.add_special_tag(text, tag, mode=mode)

        return text

    @lru_cache(maxsize=4096)
    def handle_emphasis_match(self, m):
        """
        :param m:
        :return:
        """
        text = m.group().replace("*", "")
        if "emphasis" in self.include_tags:
            text = self.add_special_tag(text, "emphasis")

        return text

    @staticmethod
    def dict_replace(wordlist, _dict):
        return [_dict[w] if w in _dict else w for w in wordlist]

    @staticmethod
    def remove_hashtag_allcaps(wordlist):
        in_hashtag = False
        _words = []
        for word in wordlist:

            if word == "<hashtag>":
                in_hashtag = True
            elif word == "</hashtag>":
                in_hashtag = False
            elif word in {"<allcaps>", "</allcaps>"} and in_hashtag:
                continue

            _words.append(word)

        return _words

    @lru_cache(maxsize=4096)
    def handle_general_word_segment_and_spelling(self, m):
        """
        :param m:
        :return:
        """
        text = m.group()
        text = self.segmenter.segment(text)

        return text

    def pre_process_doc(self, doc):

        doc = re.sub(r' +', ' ', doc)  # remove repeating spaces

        # ###########################
        # # fix bad unicode
        # ###########################
        # if self.fix_bad_unicode:
        #     doc = textacy.preprocess.fix_bad_unicode(doc)
        #
        # ###########################
        # # fix html leftovers
        # ###########################
        # doc = html.unescape(doc)

        ###########################
        # fix text
        ###########################
        if self.fix_text:
            doc = ftfy.fix_text(doc)

        ###########################
        # BACKOFF & OMIT
        ###########################
        for item in self.backoff:
            # better add an extra space after the match.
            # Just to be safe. extra spaces will be normalized later anyway
            doc = self.regexes[item].sub(
                lambda m: " " + "<" + item + ">" + " ", doc)
        for item in self.omit:
            doc = doc.replace("<" + item + ">", '')

        ###########################
        # segment other words not hashtags
        ###########################

        # doc = self.regexes['not_hashtag'].sub(
        # lambda w: self.handle_general_word_segment_and_spelling(w), doc)

        # for word in doc.split(" "):
        # if(not word.startswith('#')):
        # word = self.segmenter.segment(word)
        # new_doc.append(word)
        # doc = " ".join(new_doc)

        ###########################
        # unpack hashtags
        ###########################

        if self.unpack_hashtags:
            doc = self.regexes["hashtag"].sub(
                lambda w: self.handle_hashtag_match(w), doc)

        ###########################
        # handle special cases
        ###########################
        if self.mode != "fast":
            if "allcaps" in self.include_tags:
                doc = self.regexes["allcaps"].sub(
                    lambda w: self.handle_generic_match(
                        w, "allcaps", mode=self.all_caps_tag), doc)

            if "elongated" in self.include_tags:
                doc = self.regexes["elongated"].sub(
                    lambda w: self.handle_elongated_match(w), doc)

            if "repeated" in self.include_tags:
                doc = self.regexes["repeat_puncts"].sub(
                    lambda w: self.handle_repeated_puncts(w), doc)

            if "emphasis" in self.include_tags:
                doc = self.regexes["emphasis"].sub(
                    lambda w: self.handle_emphasis_match(w), doc)

            if "censored" in self.include_tags:
                doc = self.regexes["censored"].sub(
                    lambda w: self.handle_generic_match(w, "censored"), doc)

        ###########################
        # unpack contractions: i'm -> i am, can't -> can not...
        ###########################

        # remove textacy dependency
        if self.unpack_contractions:
            doc = unpack_contractions(doc)

        # omit allcaps if inside hashtags
        doc = re.sub(r' +', ' ', doc)  # remove repeating spaces
        # doc = re.sub(r'<hashtag><allcaps>', '<hashtag>', doc)  # remove repeating spaces
        # doc = doc.replace('<hashtag> <allcaps>', '<hashtag>')
        # doc = doc.replace('</allcaps> </hashtag>', '</hashtag>')

        ###########################
        # Tokenize
        ###########################
        doc = self.remove_hashtag_allcaps(doc.split())
        doc = " ".join(doc)  # normalize whitespace
        if self.tokenizer:
            doc = self.tokenizer(doc)

            # Replace tokens with special dictionaries (slang,emoticons ...)
            # todo: add spell check before!
            if self.dicts:
                for d in self.dicts:
                    doc = self.dict_replace(doc, d)

        return doc

    def pre_process_docs(self, docs, lazy=True):
        from tqdm import tqdm
        for d in tqdm(docs, desc="PreProcessing..."):
            yield self.pre_process_doc(d)
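
This class is essentially ekphrasis' TextPreProcessor, and pre_process_docs is a lazy generator wrapped in tqdm. A minimal sketch of driving it, assuming its module-level dependencies (regexes, Segmenter, SpellCorrector, unpack_contractions) resolve as in the installed ekphrasis package:

p = TextPreProcessor(unpack_hashtags=True, segmenter="twitter", corrector="twitter",
                     annotate={"elongated", "repeated"})
for processed in p.pre_process_docs(["I looove #TwinPeaks sooo much!!!"]):
    print(processed)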
Example #12
def clean_tweets(df):
    # define the text preprocessor
    text_processor = TextPreProcessor(
        # terms that will be normalized
        normalize=['url', 'email', 'money', 'phone', 'time', 'date'],
        # terms that will be annotated
        annotate={
            "hashtag", "allcaps", "elongated", "repeated", 'emphasis',
            'censored'
        },
        fix_html=True,  # fix HTML tokens

        # corpus from which the word statistics are going to be used
        # for word segmentation
        segmenter="twitter",

        # corpus from which the word statistics are going to be used
        # for spell correction
        corrector="twitter",
        unpack_hashtags=True,  # perform word segmentation on hashtags
        unpack_contractions=True,  # Unpack contractions (can't -> can not)
        spell_correct_elong=False,  # spell correction for elongated words

        # select a tokenizer. You can use SocialTokenizer, or pass your own
        # the tokenizer, should take as input a string and return a list of tokens
        #tokenizer=SocialTokenizer(lowercase=True).tokenize,
        tokenizer=TweetTokenizer().tokenize,

        # list of dictionaries, for replacing tokens extracted from the text,
        # with other expressions. You can pass more than one dictionaries.
        dicts=[emoticons])
    seg = Segmenter(corpus="twitter")

    tweet_text = df.tweet_text.to_list()

    clean_tweets = []
    for tweet in tweet_text:

        # manually tag usernames
        # ex: @DoctorChristian -> <user> doctor christian </user>
        match = re.findall(r'@\w+', tweet)

        try:
            for at in match:
                user_seg = seg.segment(at[1:])
                tweet = tweet.replace(at, '<user> ' + user_seg + ' </user>')
        except Exception:
            pass

        # manually tag all caps so that the unpack_contractions functions works
        match = re.findall(r"(?<![#@$])\b([A-Z][A-Z ,.']*[A-Z])\b", tweet)

        try:
            for all_caps in match:
                tweet = tweet.replace(
                    all_caps, '<allcaps> ' + all_caps.lower() + ' </allcaps>')
        except Exception:
            pass

        # manually tag percentages
        match = re.findall(r"(\d+.?\d?%)", tweet)

        try:
            for percent in match:
                tweet = tweet.replace(
                    percent,
                    '<percent> ' + percent[0:len(percent) - 1] + ' </percent>')
        except Exception:
            pass

        # deal with contractions that the tool misses
        tweet = re.sub(
            r"(\b)([Ww]hat|[Ii]t|[Hh]e|[Ss]he|[Tt]hat|[Tt]here|[Hh]ow|[Ww]ho|[Hh]ere|[Ww]here|[Ww]hen)'s",
            r"\1\2 is", tweet)
        tweet = re.sub(r"(\b)([Aa]in)'t", r"is not", tweet)
        tweet = re.sub(r"(\b)([Ww]asn)'t", r"was not", tweet)
        tweet = re.sub(r"(\b)([Hh]e|[Ss]he|[Ii]|[Yy]ou|[Tt]hey|[Ww]e)'d",
                       r"\1\2 would", tweet)
        tweet = re.sub(r"(\b)([Ii]t|[Tt]hat|[Tt]his)'ll", r"\1\2 will", tweet)
        tweet = re.sub(r"(\b)([Cc])'mon", r"come on", tweet)

        # process the rest of the tweet with the nltk tweet tokenizer
        tweet = " ".join(text_processor.pre_process_doc(tweet)).lower()

        clean_tweets.append(tweet)

    # below is code to create the tsv file of cleaned tweets
    df['tweet_text'] = clean_tweets

    return df
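
clean_tweets expects a pandas DataFrame with a tweet_text column and relies on the module-level imports used above (TextPreProcessor, TweetTokenizer, Segmenter, emoticons, re). A hypothetical call:

import pandas as pd

df = pd.DataFrame({"tweet_text": ["@DoctorChristian I'M 100% sure it wasn't #fakenews!!!"]})
df = clean_tweets(df)
print(df.tweet_text.iloc[0])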
Example #13
class TextPreProcessor:
    """
    Kwargs:
        normalize (list)
            possible values: ['url', 'email', 'percent', 'money', 'phone', 'user', 'time', 'date']

        annotate (list)
            possible values: ['hashtag', 'allcaps', 'elongated', 'repeated', 'emphasis', 'censored']

        unpack_hashtags (bool)

        unpack_contractions (bool)

        segmenter (str): select which corpus statistics to use for
            word segmentation [english, twitter]

        corrector (str): select which corpus statistics to use for
            spell correction [english, twitter]

        tokenizer (callable): callable function that accepts a string and
                returns a list of strings. If no tokenizer is provided,
                the text will be tokenized on whitespace.

        simplify_emoticons (bool)

        dictionaries (list)
    """

    def __init__(self, **kwargs):
        self.tokens_to_normalize = kwargs.get("normalize", [])
        self.annotate = kwargs.get("annotate", [])
        self.unpack_hashtags = kwargs.get("unpack_hashtags", False)
        self.unpack_contractions = kwargs.get("unpack_contractions", False)
        self.segmenter_corpus = kwargs.get("segmenter", "english")
        self.corrector_corpus = kwargs.get("corrector", "english")
        self.segmenter = Segmenter(corpus=self.segmenter_corpus)
        self.spell_corrector = SpellCorrector(corpus=self.corrector_corpus)
        self.tokenizer = kwargs.get("tokenizer", None)
        self.simplify_emoticons = kwargs.get("simplify_emoticons", False)
        self.dictionaries = kwargs.get("dictionaries", [])
        self.stats = {}
        self.preprocessed_texts = -1

    def pre_process(self, text: str, with_stats=False):
        self._increment_counter()

        text = self._remove_repeating_spaces(text)
        text = self._normalize(text)
        text = self._unpack_hashtags(text)
        text = self._annotate(text)
        text = self._unpack_contractions(text)
        text = self._remove_repeating_spaces(text)

        tokens = self._tokenize(text)
        tokens = self._simplify_emoticons(tokens)
        tokens = self._replace_using_dictionaries(tokens)

        if with_stats:
            return tokens, self._pre_processed_text_stats()
        else:
            return tokens

    def _pre_processed_text_stats(self):
        return self.stats[self.preprocessed_texts]

    def _increment_counter(self):
        self.preprocessed_texts += 1
        self.stats[self.preprocessed_texts] = {}

    def _normalize(self, text):
        for item in self.tokens_to_normalize:
            text = self._change_using_regexp(item, lambda m: f' <{item}> ', text, 'normalize')
        return text

    def _unpack_hashtags(self, text):
        if self.unpack_hashtags:
            return self._change_using_regexp("hashtag", lambda w: self._handle_hashtag_match(w), text, "unpack")
        return text

    def _annotate(self, text):
        text = self._annotate_allcaps(text)
        text = self._annotate_elongated(text)
        text = self._annotate_repeated(text)
        text = self._annotate_emphasis(text)
        text = self._annotate_censored(text)
        return text

    def _annotate_allcaps(self, text):
        if "allcaps" in self.annotate:
            return self._change_using_regexp("allcaps", lambda w: self._handle_generic_match(w, "allcaps", mode='wrap'),
                                             text, "annotate")
        return text

    def _annotate_elongated(self, text):
        if "elongated" in self.annotate:
            return self._change_using_regexp("elongated", lambda w: self._handle_elongated_match(w), text, "annotate")
        return text

    def _annotate_repeated(self, text):
        if "repeated" in self.annotate:
            return self._change_using_regexp("repeat_puncts", lambda w: self._handle_repeated_puncts(w), text,
                                             "annotate")
        return text

    def _annotate_emphasis(self, text):
        if "emphasis" in self.annotate:
            return self._change_using_regexp("emphasis", lambda w: self._handle_emphasis_match(w), text, "annotate")
        return text

    def _annotate_censored(self, text):
        if "censored" in self.annotate:
            return self._change_using_regexp("censored", lambda w: self._handle_generic_match(w, "censored"), text,
                                             "annotate")
        return text

    def _change_using_regexp(self, regexp_name, func, text, stats_name_prefix):
        changing_result = regexes[regexp_name].subn(func, text)
        self._update_stats(f'{stats_name_prefix}_{regexp_name}', changing_result[1])
        return changing_result[0]

    def _unpack_contractions(self, text):
        if self.unpack_contractions:
            text = self._unpack_selected_contrations(r"(\b)([Aa]re|[Cc]ould|[Dd]id|[Dd]oes|[Dd]o|[Hh]ad|[Hh]as|[Hh]ave|"
                                                     r"[Ii]s|[Mm]ight|[Mm]ust|[Ss]hould|[Ww]ere|[Ww]ould)n'?t",
                                                     r"\1\2 not", text)

            text = self._unpack_selected_contrations(r"(\b)([Hh]e|[Ii]|[Ss]he|[Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Yy]ou)'ll",
                                                     r"\1\2 will", text)
            text = self._unpack_selected_contrations(r"(\b)([Tt]hey|[Ww]hat|[Ww]ho|[Yy]ou)ll", r"\1\2 will", text)

            text = self._unpack_selected_contrations(r"(\b)([Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Yy]ou)'re", r"\1\2 are", text)
            text = self._unpack_selected_contrations(r"(\b)([Tt]hey|[Ww]hat|[Yy]ou)re", r"\1\2 are", text)

            text = self._unpack_selected_contrations(r"(\b)([[Hh]e|[Ss]he)'s", r"\1\2 is", text)

            text = self._unpack_selected_contrations(
                r"(\b)([Ii]|[Ss]hould|[Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Ww]ould|[Yy]ou)"
                r"'?ve", r"\1\2 have", text)

            text = self._unpack_selected_contrations(r"(\b)([Cc]a)n't", r"\1\2n not", text)
            text = self._unpack_selected_contrations(r"(\b)([Ii])'m", r"\1\2 am", text)
            text = self._unpack_selected_contrations(r"(\b)([Ll]et)'?s", r"\1\2 us", text)
            text = self._unpack_selected_contrations(r"(\b)([Ww])on'?t", r"\1\2ill not", text)
            text = self._unpack_selected_contrations(r"(\b)([Ss])han'?t", r"\1\2hall not", text)
            text = self._unpack_selected_contrations(r"(\b)([Yy])(?:'all|a'll)", r"\1\2ou all", text)

        return text

    def _unpack_selected_contrations(self, regexp, replacement, text):
        unpacking_result = re.subn(regexp, replacement, text)
        self._update_stats("unpack_contrations", unpacking_result[1])
        return unpacking_result[0]

    def _tokenize(self, text):
        if self.tokenizer:
            return self.tokenizer(text)
        else:
            return text.split(' ')

    def _simplify_emoticons(self, tokens):
        if self.simplify_emoticons:
            result = []
            for token in tokens:
                if token in emoticons:
                    new_emoticon = emoticons[token]
                    if new_emoticon != token:
                        self._update_stats('emoticon_simplification', 1)
                    result.append(new_emoticon)
                else:
                    result.append(token)
            return result
        else:
            return tokens

    def _replace_using_dictionaries(self, tokens):
        if len(self.dictionaries) > 0:
            for dictionary in self.dictionaries:
                for idx, token in enumerate(tokens):
                    if token in dictionary:
                        value = dictionary[token]
                        if '<entity>' not in value:
                            tokens[idx] = value
                            self._update_stats('dictionary_replacement', 1)
            return ' '.join(tokens).split(' ')
        else:
            return tokens

    @lru_cache(maxsize=65536)
    def _handle_hashtag_match(self, m):
        text = m.group()[1:]

        if text.islower():
            expanded = self.segmenter.segment(text)
            expanded = " ".join(expanded.split("-"))
            expanded = " ".join(expanded.split("_"))
        else:
            expanded = regexes["camel_split"].sub(r' \1', text)
            expanded = expanded.replace("-", "")
            expanded = expanded.replace("_", "")

        if "hashtag" in self.annotate:
            expanded = self._add_special_tag(expanded, "hashtag", mode="wrap")

        return expanded

    @lru_cache(maxsize=65536)
    def _handle_generic_match(self, m, tag, mode="every"):
        text = m.group()
        if tag == 'allcaps':  # workaround for allcaps contractions like YOU'RE; TODO: refactor
            text = text.lower()

        text = self._add_special_tag(text, tag, mode=mode)

        return text

    def _handle_elongated_match(self, m):
        text = m.group()

        text = regexes["normalize_elong"].sub(r'\1\1', text)

        normalized = self.spell_corrector.normalize_elongated(text)
        if normalized:
            text = normalized

        text = self._add_special_tag(text, "elongated")

        return text

    @lru_cache(maxsize=65536)
    def _handle_repeated_puncts(self, m):
        text = m.group()
        text = "".join(sorted(set(text), reverse=True))
        text = self._add_special_tag(text, "repeated")

        return text

    @lru_cache(maxsize=65536)
    def _handle_emphasis_match(self, m):
        text = m.group().replace("*", "")
        text = self._add_special_tag(text, "emphasis")

        return text

    def _update_stats(self, key, value):
        if value > 0:
            stats_for_text = self.stats[self.preprocessed_texts]

            if key not in stats_for_text:
                stats_for_text[key] = 0
            stats_for_text[key] += value

    @staticmethod
    def _remove_repeating_spaces(text):
        return re.sub(r' +', ' ', text).strip()

    @staticmethod
    def _add_special_tag(m, tag, mode="single"):

        if isinstance(m, str):
            text = m
        else:
            text = m.group()

        if mode == "single":
            return " {} <{}> ".format(text, tag)
        elif mode == "wrap":
            return " ".join([" <{}> {} </{}> ".format(tag, text, tag)]) + " "
        elif mode == "every":
            tokens = text.split()
            processed = " ".join([" {} <{}> ".format(t, tag)
                                  for t in tokens])
            return " " + processed + " "
Example #14
from ekphrasis.classes.segmenter import Segmenter

# segmenter using the word statistics from English Wikipedia
seg_eng = Segmenter(corpus="english")

# segmenter using the word statistics from Twitter
seg_tw = Segmenter(corpus="twitter")

# segmenter using the word statistics from Twitter (2018 corpus)
seg_tw_2018 = Segmenter(corpus="twitter_2018")

words = [
    "exponentialbackoff", "gamedev", "retrogaming", "thewatercooler",
    "panpsychism"
]
for w in words:
    print(w)
    print("(eng):", seg_eng.segment(w))
    print("(tw):", seg_tw.segment(w))
    print("(tw):", seg_tw_2018.segment(w))
    print()
Example #15
def tokenize_hashtags(hashtags):
    seg_eng = Segmenter(corpus="english")
    # segment each hashtag individually
    hash = ' '.join(seg_eng.segment(h) for h in hashtags)
    return hash
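
With the per-hashtag call fixed above, a small assumed usage would be:

from ekphrasis.classes.segmenter import Segmenter  # needed at module level by tokenize_hashtags

print(tokenize_hashtags(["gamedev", "thewatercooler"]))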
Example #16
def preprocess_corpus(corpus,stemming=False,
                      all_smilies=False, pos_smilies=False, neg_smilies=False, other_smilies=False,
                      hugs_and_kisses=False,hearts=False,
                      hashtag=False, hashtag_mention=False, 
                      numbers=False, number_mention=False, 
                      exclamation=False,  ## NOTE: this is not tested yet; consider just removing it
                      set_to_not=False, 
                      segmentation_hash= False, 
                      spelling=False,
                      elongation=False, 
                      remove_signs=False
                      ):
    """ Function used to apply preprocessing
    Input:
        corpus: a corpus on the format as the output in creat_corpus. Default False. 
        all_smilies: if true, same effect as if pos_smilies, neg_smilies, and other_smilies were true.Default False.
        pos_smilies: if true, positive smilies such as : ), : (, ; ), ( ;, :p, ;p, : p, are replaced by "possmiley.Default False.
        neg_smilies: if true, negative smilies such as : (, ) : are replaced by "negsmiely".Default False.
        other_smilies: if true, smilies such as ^_^ are replaced by a describing word.Default False. 
        hugs_and_kisses: if true, words such as xxx xoxo etc are replaced by "kisses" or "hug" and "kisses". Default False.
        hearts: if true, "<3" are replaced by "heart".Default False.
        hashtags: if true, hashtags are removed from the beginning of words, so #apple becomes apple.Default False. 
        hashtag_mention: if true, and if hashtag is true, the word "hashatag" is added at the end of a tweet that used to contain
            one or more words beginning with a hashtag. Default False.
        numbers: if true, words that are purely numbers are removed.Default False.
        number_mention: if true, and if number is true, the word "thereisanumber" is added at the end of a tweet that used 
            to contain one or more words that were purely numbers. Default False.
        exclamation: if true, the word "exclamation" is added at the end of a tweet that contain one or more "!".Default False. 
        set_to_not: if true, all words ending with "n't" is replaced by not.Default False. 
        segmentation_hash: if true, words starting with # that do not appear in the english dictionary is split into segments, 
            eg '#iammoving' becomes 'i am moving'. Default False.
        spelling: if true, all words that are not a part of the english dictionary is set to the most likely word,
            within two alterations. Default False.
        elongation: if true, the length of all sequences of letters in words that are not a part of the English dictionary 
            is set to max 2. Before words that are altered because of this, the word 'elongation' appears. Default False.
        remove_signs: if true, signs such as ",", ".", ":", ";", "-", are removed. Default False.
    
    Output:
        new_corpus: a new corpus, on same format as the input corpus. 
    """
   
    start = time.time()
    
    #initialising the new corpus:
    new_corpus=[]

    #Want to split the tweets using this tokenizer:
    tknzr = TweetTokenizer(reduce_len=True)
    
    
    
    if stemming:
        ps = PorterStemmer()
    
    if segmentation_hash or spelling or elongation:
        d = enchant.Dict("en_US")
    
    if segmentation_hash: 
        #seg = Segmenter(corpus="english")
        seg = Segmenter(corpus="twitter")

    if spelling: 
        sp = SpellCorrector(corpus="english")
        
    
    elapsed = time.time()
    print("Time in min before starting first for loop:", (elapsed - start) / 60 )
    
    #Want to go though each line (tweet) in the corpus
    for k, line in enumerate(corpus):
        
        
        if hashtag_mention:
            there_is_hashtag=False
        if number_mention:
            there_is_number=False
        if exclamation:
            there_is_exclamation=False
            
        #Splitting the tweet using the chosen tokenizer. 
        words=tknzr.tokenize(line)
        #Initializing for cleaned_tweet:
        cleaned_tweet=[]
        
        for i, word in enumerate(words):
            #Indicating that the word has not been treated yet
            word_not_treated=True
            end_=len(words)-1
            if ((pos_smilies or all_smilies) and word_not_treated):
                if (i>0 and (word=='d' and (words[i-1]==':' or words[i-1]==';'))) or word == ':d' or word == ';d':
                    cleaned_tweet.append('smile')
                    word_not_treated=False
                elif (i>0 and (word=='p' and (words[i-1]==':' or words[i-1]==';'))) or word == ':p' or word == ';p' :
                    cleaned_tweet.append('smile')
                    word_not_treated=False
                elif i>0 and word=='d' and (words[i-1]==':' or words[i-1]==';' or words[i-1]=='x'):
                    cleaned_tweet.append('smile')
                    word_not_treated=False
                elif i>0 and words[i-1]=='(' and (word==':' or word==';'):
                    cleaned_tweet.append('smile')
                    word_not_treated=False
                elif i>0 and word==')' and (words[i-1]==':' or words[i-1]==';'):
                    cleaned_tweet.append('smile')
                    word_not_treated=False

            if ((neg_smilies or all_smilies) and word_not_treated):
                if i>0 and words[i-1]==')' and (word==':' or word==';'):
                    cleaned_tweet.append('sad')
                    word_not_treated=False
                elif i>0 and word=='(' and (words[i-1]==':' or words[i-1]==';'):
                    cleaned_tweet.append('sad')
                    word_not_treated=False
            
            if ((other_smilies or all_smilies) and word_not_treated):
                if i>0  and i<end_ and word=='_' and words[i-1]=='^' and words[i+1]=='^':
                    cleaned_tweet.append('eyesmiley')
                    word_not_treated=False
                elif i>0 and word=='o' and words[i-1]==':':
                    cleaned_tweet.append('openmouthface')
                    word_not_treated=False
                elif i>0 and word=='/' and words[i-1]==':':
                    cleaned_tweet.append('slashsmiely')
                    word_not_treated=False
                elif i>0 and word=='*' and (words[i-1]==':' or words[i-1]==';'):
                    cleaned_tweet.append('kiss')
                    word_not_treated=False
                
            if ((hugs_and_kisses and word_not_treated)):
                    #want to find hearts, hugs, kisses, etc: 
                if (word == "xoxo" or word == "xo" or word == "xoxoxo" or word == "xxoo"):
                    cleaned_tweet.append('hug')
                    cleaned_tweet.append('kiss')
                    word_not_treated=False
                elif (word=='xx' or word=='xxx'or word=='xxxx'):
                    cleaned_tweet.append('kiss')
                    word_not_treated=False
            
            if ((hearts and word_not_treated)):
                if word == "<3":
                    cleaned_tweet.append('heart')
                    word_not_treated=False
            
            if (hashtag and word_not_treated):
                if word[0]=='#':
                    there_is_hashtag=True
                    if (len(word)>1 and segmentation_hash and not d.check(word[1:])):
                        cleaned_tweet.append(seg.segment(word[1:]))
                    else:
                        cleaned_tweet.append(word[1:])
                    word_not_treated=False
            
            if (numbers and word_not_treated):
                if word.isdigit():
                    there_is_number=True
                    word_not_treated=False
                    
            if (exclamation and word_not_treated):
                if word=='!':
                    there_is_exclamation=True
                    cleaned_tweet.append(word)
                    word_not_treated=False
            
            if (set_to_not and word_not_treated):
                if word[-3:]=='n\'t':
                    cleaned_tweet.append('not')
                    word_not_treated=False
           
            
         
            if (word_not_treated):
                if (not remove_signs) or (remove_signs and ( (word!= '^' and word!=',' and word!='.' and word!=':' 
                                                              and word!='-' and word!='´' and word!=';'and word!=')' 
                                                              and word!='(' and word!='*'))):
                  
                    if ((not word[0].isdigit()) and elongation and not d.check(word) and len(word)>2):
                        new=[]
                        new.append(word[0])
                        for i,letter in enumerate(word):
                            if i>0 and i<len(word)-1: 
                                if not( letter==word[i-1]==word[i+1]):
                                    new.append(letter)
                        new.append(word[-1])
                        new_word=''.join(new)
                        if new_word!= word:
                            cleaned_tweet.append('elongation')
                            word=new_word

                    if spelling and not d.check(word)and len(word)>2: 
                        word=sp.correct(word)
                    if stemming:
                        word=ps.stem(word)

                    
                    cleaned_tweet.append(word)

           
                
        
        if (hashtag_mention and there_is_hashtag) :
            cleaned_tweet.append('hashtag')
        if (number_mention and there_is_number) :
            cleaned_tweet.append('number')
        if (exclamation and there_is_exclamation):
            cleaned_tweet.append('exclamation')
            
            
        new_words = ' '.join(cleaned_tweet)
        new_words = new_words.encode('utf-8')
        new_corpus.append(new_words)
        
        if np.mod(k,25000)==1:
                elapsed = time.time()
                print("Time in min after", k, " tweets:", (elapsed - start) / 60 )

        
    elapsed = time.time()
    print("Time in min total:", (elapsed - start) / 60 )
    return new_corpus       
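
A hypothetical call that enables a few of the documented flags, assuming the module-level imports the function relies on (time, numpy, nltk's TweetTokenizer and PorterStemmer, pyenchant, and the ekphrasis Segmenter/SpellCorrector) are in place:

corpus = ["Sooo happy today!!! #iammoving : )", "I can't believe it xoxo <3"]
cleaned = preprocess_corpus(corpus,
                            all_smilies=True,
                            hugs_and_kisses=True, hearts=True,
                            hashtag=True, hashtag_mention=True,
                            segmentation_hash=True,
                            set_to_not=True,
                            remove_signs=True)
print(cleaned)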
Example #17
# ekphrasis is meant for sentiment analysis; here it is used specifically for hashtag segmentation


# Method to clean tweets: remove special characters, hashtags and URLs
def clean_tweet(tweet):
    tweet = re.sub(r"pic.\S+", "", tweet)
    return ' '.join(
        re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ",
               tweet).split())


# Query the 20 most recent tweets
tweets = query_tweets_from_user("realDonaldTrump", 20)

# Print the cleaned tweets and collect their hashtags
hashtagArray = []
for tweet in tweets:
    print(clean_tweet(tweet.text))
    tweetHashtag = re.findall(r"#(\w+)", tweet.text)
    if len(tweetHashtag) != 0:
        hashtagArray.extend(tweetHashtag)
    print("\n")

# The corpus refers to the statistics used to segment the hashtags; here they come from Twitter
seg_tw = Segmenter(corpus="twitter")

print("Hashtag Segmentation:\n")

for hashtag in hashtagArray:
    print("(tw):", seg_tw.segment(hashtag))
Example #18
        'url', 'email', 'percent', 'money', 'phone', 'user', 'time', 'url',
        'date', 'number'
    ],
    annotate={
        "hashtag", "allcaps", "elongated", "repeated", 'emphasis', 'censored'
    },
    fix_html=True,  # fix HTML tokens
    segmenter="twitter",
    corrector="twitter",
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=True,  # spell correction for elongated words
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    dicts=[emoticons])

seg_tw = Segmenter(corpus="twitter")
sp = SpellCorrector(corpus="twitter")
f1 = open('tokenized_tweets_golbeck.txt', 'w')
c = 1
for line in data:
    a = line.strip().split('\t')
    if len(a) >= 3:
        b = a[2]
        c = a[1]
        b = b.split()
        for i in range(len(b)):
            if b[i].startswith('http'):
                b[i] = '<url>'
        b = ' '.join(b)
        a = text_processor.pre_process_doc(b)
        for i in range(len(a)):
Example #19
warnings.filterwarnings("ignore")

sys.stdout = open("./output/disaster_output.txt", "w")

plt.style.use('ggplot')

nlp = spacy.load('en_core_web_sm')
deselect_stop_words = ['no', 'not']  # we don't consider no and not stop words
for w in deselect_stop_words:
    nlp.vocab[w].is_stop = False

lemmatizer = WordNetLemmatizer()
stop_words = safe_get_stop_words('en')
hashtag_regex = re.compile(r"\#\b[\w\-\_]+\b")
twitter_segmenter = Segmenter(corpus="twitter_2018")
camelcase_regex = re.compile(
    r'((?<=[a-z])[A-Z]|(?<!^)[A-Z](?=[a-z])|[0-9]+|(?<=[0-9\-\_])[A-Za-z]|[\-\_])'
)


# DATA PRE-PROCESSING FUNCTIONS
def unescape_tweet(tweet):
    """Unescaping various chars found in text """
    return html.unescape(tweet)


def strip_html_tags(text):
    """remove html tags from text"""
    soup = BeautifulSoup(text, 'lxml')
    stripped_text = soup.get_text(separator=" ")
Example #20
"""
Dependency: Preinstalled Dataset for ekphrasis
"""

import sys
import re
import numpy as np
from enum import Enum
from sklearn import metrics
import tensorflow as tf
from tensorflow.contrib import rnn
from ekphrasis.classes.segmenter import Segmenter
import warnings
warnings.simplefilter("ignore")
# Twitter Hashtag Parser
tw = Segmenter(corpus="twitter")


# Configuration class for training model.
class Configuration:
    num_epochs = 500
    size_batch = 256
    max_time_steps = 40
    LSTM_CT = 4
    LSTM_SZ = 200
    ratio_dropout = 0.95
    embedding_size = 100
    rate_learning = 0.01


class PredictionPhase(Enum):
Example #21
    corrector="twitter",
    unpack_hashtags=True,  # perform word segmentation on hashtags
    # unpack_users=True, # dunno if this is a thing
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=False,  # spell correction for elongated words

    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer, should take as input a string and return a list of tokens
    #tokenizer=SocialTokenizer(lowercase=True).tokenize,
    tokenizer=TweetTokenizer().tokenize,

    # list of dictionaries, for replacing tokens extracted from the text,
    # with other expressions. You can pass more than one dictionaries.
    dicts=[emoticons])

seg = Segmenter(corpus="twitter")

clean_tweets = []
for tweet in data:

    # manually tag usernames
    # ex: @DoctorChristian -> <user> doctor christian </user>
    match = re.findall(r'@\w+', tweet)

    try:
        for at in match:
            user_seg = seg.segment(at[1:])
            tweet = tweet.replace(at, '<user> ' + user_seg + ' </user>')
    except Exception:
        pass
Example #22
    # for spell correction
    corrector="twitter",
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=True,  # spell correction for elongated words
    spell_correction=True,

    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer, should take as input a string and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,

    # list of dictionaries, for replacing tokens extracted from the text,
    # with other expressions. You can pass more than one dictionaries.
    dicts=[emoticons, slangdict])

segmenter = Segmenter(corpus="twitter")
count = 0
all_texts = []
user_dict = defaultdict(lambda: None)

for file_name in sorted(os.listdir(tweet_path)):
    if file_name.endswith('.json'):
        print('processing ' + file_name)
        with open(tweet_path + file_name, 'r') as tweet_batch:
            tweets = json.load(tweet_batch)
            for tweet in tweets:
                text = tweet['content']['text']
                # text = preprocess(text)  # optional extra preprocessing (preprocess() not shown in this snippet)
                tokens = text_processor.pre_process_doc(text)
                tokens = [segmenter.segment(t) for t in tokens]
                text = " ".join(tokens)
                text = process_tags(text).strip()