def __init__(self, **kwargs):
    """Configure the pre-processor from keyword options.

    Recognised keys (every one is optional):
        normalize (list): stored as ``tokens_to_normalize``; default ``[]``.
        annotate (list): stored as ``annotate``; default ``[]``.
        unpack_hashtags (bool): default ``False``.
        unpack_contractions (bool): default ``False``.
        segmenter (str): corpus name handed to ``Segmenter``;
            default ``"english"``.
        corrector (str): corpus name handed to ``SpellCorrector``;
            default ``"english"``.
        tokenizer (callable): default ``None``.
        simplify_emoticons (bool): default ``False``.
        dictionaries (list): default ``[]``.

    A ``Segmenter`` and a ``SpellCorrector`` are always created, whatever
    the other options say.
    """
    option = kwargs.get

    # What to rewrite in the text.
    self.tokens_to_normalize = option("normalize", [])
    self.annotate = option("annotate", [])
    self.unpack_hashtags = option("unpack_hashtags", False)
    self.unpack_contractions = option("unpack_contractions", False)

    # Corpus-backed helpers.
    self.segmenter_corpus = option("segmenter", "english")
    self.corrector_corpus = option("corrector", "english")
    self.segmenter = Segmenter(corpus=self.segmenter_corpus)
    self.spell_corrector = SpellCorrector(corpus=self.corrector_corpus)

    # Token-level post-processing.
    self.tokenizer = option("tokenizer", None)
    self.simplify_emoticons = option("simplify_emoticons", False)
    self.dictionaries = option("dictionaries", [])

    # Per-text statistics, keyed by a running counter that starts at -1 so
    # that the first increment yields index 0.
    self.stats = {}
    self.preprocessed_texts = -1
def hashtag_sentiment(tweet):
    """Return the mean compound sentiment of the hashtags in ``tweet``.

    Every ``#tag`` made of 1-25 ASCII letters/digits is segmented into
    words, scored with ``sid.polarity_scores`` and its ``compound`` value
    collected. The sign is flipped when the segmented text contains the
    word ``not``.

    Returns:
        The average of the collected scores rounded to two decimals, or
        ``0`` when the tweet has no matching hashtag.
    """
    tags = re.findall("#([a-zA-Z0-9]{1,25})", tweet)
    segmenter = Segmenter()

    scores = []
    for tag in tags:
        words = segmenter.segment(tag)
        compound = sid.polarity_scores(words)['compound']
        negated = 'not' in words.split(' ')
        scores.append(-compound if negated else compound)

    if not scores:
        return 0
    return round(float(sum(scores) / float(len(scores))), 2)
def hashtag_sentiment(tweet):
    """Average the sentiment carried by the hashtags of ``tweet``.

    Hashtags (1-25 ASCII alphanumerics after ``#``) are split into words
    with a Twitter-corpus ``Segmenter`` and scored through
    ``sid.polarity_scores``. Only the ``compound`` entry of the returned
    dictionary is used; it is negated when the segmented hashtag contains
    the token ``not``.

    Returns:
        Mean compound score rounded to 2 decimals, ``0`` if there is no
        hashtag to score.
    """
    found = re.findall("#([a-zA-Z0-9]{1,25})", tweet)
    polarities = []
    twitter_segmenter = Segmenter(corpus="twitter")

    def signed_compound(raw_tag):
        # Segment first: both the scorer and the negation test work on the
        # space-separated words, not on the raw hashtag.
        phrase = twitter_segmenter.segment(raw_tag)
        scores = sid.polarity_scores(phrase)
        if 'not' in phrase.split(' '):
            return -scores['compound']
        return scores['compound']

    for raw_tag in found:
        polarities.append(signed_compound(raw_tag))

    if len(polarities) == 0:
        return 0
    mean = sum(polarities) / float(len(polarities))
    return round(float(mean), 2)
def handle_tweets(df_tweets):
    """Split every tweet into its cleaned text and its segmented hashtags.

    Args:
        df_tweets: object whose ``"text"`` item is an iterable of tweet
            strings (presumably a pandas DataFrame -- confirm with caller).

    Returns:
        A ``(clean_tweets, hashtags)`` pair of equally long lists. For each
        tweet, ``clean_tweets`` holds the result of ``tokenizer`` applied to
        the tweet with its hashtags removed, and ``hashtags`` holds the list
        of its hashtags, lower-cased, stripped of ``#`` and word-segmented.
    """
    seg_eng = Segmenter(corpus="english")

    # Compiled once instead of once per tweet. The original pattern
    # r'#\w+|#\w+$' had a second alternative that could never win, so the
    # single alternative matches exactly the same text.
    hashtag_re = re.compile(r'#\w+')

    clean_tweets = []
    hashtags = []
    for text in list(df_tweets["text"]):
        matches = hashtag_re.findall(text)
        segmented = [seg_eng.segment(tag.lstrip('#').lower()) for tag in matches]
        clean_tweets.append(tokenizer(hashtag_re.sub('', text)))
        hashtags.append(segmented)
    return clean_tweets, hashtags
def extractHashtags(dataset):
    """Add a ``hashtags`` column derived from the ``text`` column.

    For every row, the hashtags (``#`` followed by word characters, the
    ``#`` itself dropped) are collected and handed, together with a
    Twitter-corpus ``Segmenter``, to ``splitUpTweets``. Whatever that helper
    returns becomes the row's ``hashtags`` value.

    Args:
        dataset: object with a ``text`` column supporting ``.apply``
            (presumably a pandas DataFrame -- confirm with caller). It is
            modified in place.

    Returns:
        The same ``dataset`` object, with the new column.
    """
    # The stop-word set that used to be built here was never read. It only
    # cost time and made the function fail when the NLTK stop-word corpus
    # was not installed, so it is gone.
    seg_tw = Segmenter(corpus="twitter")

    raw_tags = dataset['text'].apply(lambda text: re.findall(r"#(\w+)", text))
    dataset['hashtags'] = raw_tags.apply(lambda tags: splitUpTweets(tags, seg_tw))
    return dataset
def segmentation(self):
    """Word-segment every entry of ``self.text`` with Twitter statistics.

    ``self.text`` is replaced by a list holding the segmented form of each
    of its items, and that list is also returned.
    """
    from ekphrasis.classes.segmenter import Segmenter

    # Only the Twitter segmenter is used. The English one that used to be
    # created alongside it was never called and merely loaded a second
    # corpus.
    seg_tw = Segmenter(corpus="twitter")
    self.text = [seg_tw.segment(sent) for sent in self.text]
    return self.text
import random import re from ekphrasis.classes.segmenter import Segmenter listOfFILEcomments = [] listOfFILEposts = [] listOfFILEtags = [] # for i in range(0,17): # listOfFILEcomments.append('allcomments' + str(i) + '.txt') # listOfFILEposts.append('allposts' + str(i) + '.txt') # listOfFILEtags.append('alltags' + str(i) + '.txt') seg_eng = Segmenter(corpus="english") listOfFILEcomments = ['allcomments.txt'] listOfFILEposts = ['allposts.txt'] listOfFILEtags = ['alltags.txt'] fhc = open('finalallcomments.txt', 'a+') fhp = open('finalallposts.txt', 'a+') fht = open('finalalltags.txt', 'a+') for commentFILE, postFILE, tagFILE in zip(listOfFILEcomments, listOfFILEposts, listOfFILEtags): commentGenerator = open(commentFILE, 'r') postGenerator = open(postFILE, 'r') tagGenerator = open(tagFILE, 'r') for comment, post, tag in zip(commentGenerator, postGenerator, tagGenerator): if comment.strip() and post.strip() and tag.strip(): fhc.write(comment) fhp.write(post) fht.write(';'.join(
def __init__(self, **kwargs):
    """
    Kwargs:
        omit (list): choose what tokens that you want to omit from the text.
            possible values: ['email', 'percent', 'money', 'phone', 'user',
                'time', 'url', 'date', 'hashtag']
            Important Notes:
                        1 - put url at front, if you plan to use it.
                            Messes with the regexes!
                        2 - if you use hashtag then unpack_hashtags will
                            automatically be set to False

        normalize (list): choose what tokens that you want to normalize
            from the text.
            possible values: ['email', 'percent', 'money', 'phone', 'user',
                'time', 'url', 'date', 'hashtag']
            for example: user@example.com will be transformed to <email>
            Important Notes:
                        1 - put url at front, if you plan to use it.
                            Messes with the regexes!
                        2 - if you use hashtag then unpack_hashtags will
                            automatically be set to False

        unpack_contractions (bool): Replace *English* contractions in
            ``text`` str with their unshortened forms
            for example: can't -> can not, wouldn't -> would not, and so on...

        unpack_hashtags (bool): split a hashtag to its constituent words.
            for example: #ilikedogs -> i like dogs

        annotate (list): add special tags to special tokens.
            possible values: ['hashtag', 'allcaps', 'elongated', 'repeated']
            for example: user@example.com -> user@example.com <email>

        tokenizer (callable): callable function that accepts a string and
            returns a list of strings. Stored as given; defaults to None.

        dicts (list): dictionaries stored on ``self.dicts``;
            defaults to None.

        segmenter (str): define the statistics of what corpus you would
            like to use [english, twitter]

        corrector (str): define the statistics of what corpus you would
            like to use [english, twitter]

        all_caps_tag (str): how to wrap the capitalized words
            values [single, wrap, every]
            Note: applicable only when `allcaps` is included in annotate[]
                - single: add a tag after the last capitalized word
                - wrap: wrap all words with opening and closing tags
                - every: add a tag after each word

        spell_correct_elong (bool): choose if you want to perform
            spell correction after the normalization of elongated words.
            * significantly affects performance (speed)

        spell_correction (bool): choose if you want to perform
            spell correction to the text
            * significantly affects performance (speed)

        fix_text (bool): choose if you want to fix bad unicode terms and
            html entities. The legacy key ``fix_bad_unicode`` is still
            honoured when ``fix_text`` is not given.

        mode (str): defaults to "normal". With "fast" no SpellCorrector
            is created.
    """
    self.omit = kwargs.get("omit", {})
    self.backoff = kwargs.get("normalize", {})
    self.include_tags = kwargs.get("annotate", {})
    self.unpack_contractions = kwargs.get("unpack_contractions", False)
    self.tokenizer = kwargs.get("tokenizer", None)
    self.dicts = kwargs.get("dicts", None)
    self.spell_correction = kwargs.get("spell_correction", False)
    self.spell_correct_elong = kwargs.get("spell_correct_elong", False)

    # The documented name is `fix_text`; only `fix_bad_unicode` used to be
    # read, so the documented spelling was silently ignored.
    self.fix_text = kwargs.get("fix_text",
                               kwargs.get("fix_bad_unicode", False))

    self.unpack_hashtags = kwargs.get("unpack_hashtags", False)
    self.segmenter_corpus = kwargs.get("segmenter", "english")
    self.corrector_corpus = kwargs.get("corrector", "english")
    self.all_caps_tag = kwargs.get("all_caps_tag", "wrap")
    self.mode = kwargs.get("mode", "normal")

    # Corpus-backed helpers are only built when they can be needed.
    if self.unpack_hashtags:
        self.segmenter = Segmenter(corpus=self.segmenter_corpus)
    if self.mode != "fast":
        self.spell_corrector = SpellCorrector(corpus=self.corrector_corpus)

    self.regexes = ExManager().get_compiled()

    # A hashtag that is omitted/normalized cannot also be unpacked.
    if 'hashtag' in self.omit or 'hashtag' in self.backoff:
        print("You can't omit/backoff and unpack hashtags!\n "
              "unpack_hashtags will be set to False")
        self.unpack_hashtags = False
import pandas as pd import re from nltk.corpus import stopwords import pickle import ekphrasis import numpy as np from ekphrasis.classes.segmenter import Segmenter from process_tweets import tokenizer, ner stop_words = set() stop_words.add("rt") stop_words.add("RT") seg_eng = Segmenter(corpus="english") # get only tweets with notinline hashtags, for inline hashtags it split into words, for not inline, it deleted def get_inline_notinline_htags(list_words): end_wt_punc = re.compile(r'#\w+[^a-zA-Z\d\s]') all_letters = re.compile(r'\w+$') all_letters_wt_punc = re.compile(r'\w+$|\w+[:,.]$') end = -1 inline_ht, not_inline_ht = [], [] for i in range(len(list_words)): if i <= end or not list_words[i].startswith("#"): continue end = i previous_word = None next_word = None if i > 0: previous_word = list_words[i - 1] for j in range(i + 1, len(list_words)):
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import os
import pickle
from data_util.my_stopwords import *
from data_util.extract_key import extract_PF
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
from ekphrasis.classes.segmenter import Segmenter

# Segmenter using the word statistics of the "twitter" corpus
# ("english" is the other supported value).
seg_eng = Segmenter(corpus="twitter")

from ekphrasis.classes.spellcorrect import SpellCorrector

sp = SpellCorrector(corpus="english")  # english or twitter

# Single characters (plus the empty string and '#') treated as stop words.
alphbet_stopword = ['','b','c','d','e','f','g','h','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z','#']

# Word-segmentation (stop-word) dictionary
from nltk.corpus import stopwords as nltk_stopwords

nltk_stopwords = set(nltk_stopwords.words("english"))

# Read through a context manager so the file handle is closed right away.
with open("data_util/stopwords.txt", "r", encoding="utf-8") as _stopwords_file:
    stpwords_list3 = [f.replace("\n", "") for f in _stopwords_file.readlines()]
stpwords_list3.remove("not")  # "not" must survive stop-word filtering

# html_escape_table, stpwords_list1 and stpwords_list2 are not defined in
# this block; presumably they come from the star import of
# data_util.my_stopwords -- confirm.
stopwords = list(html_escape_table + stpwords_list2) + list(list(nltk_stopwords) + list(stpwords_list1) + list(stpwords_list3))
stopwords = stopwords + ["."] + alphbet_stopword
print("斷詞辭典 已取得")
class TextPreProcessor:
    """Regex-driven text normaliser for social-media text.

    ``pre_process_doc`` is the entry point; ``pre_process_docs`` maps it
    lazily over an iterable of documents.
    """

    def __init__(self, **kwargs):
        """
        Kwargs:
            omit (list): choose what tokens that you want to omit from the text.
                possible values: ['email', 'percent', 'money', 'phone', 'user',
                    'time', 'url', 'date', 'hashtag']
                Important Notes:
                            1 - put url at front, if you plan to use it.
                                Messes with the regexes!
                            2 - if you use hashtag then unpack_hashtags will
                                automatically be set to False

            normalize (list): choose what tokens that you want to normalize
                from the text.
                possible values: ['email', 'percent', 'money', 'phone', 'user',
                    'time', 'url', 'date', 'hashtag']
                for example: user@example.com will be transformed to <email>
                Important Notes:
                            1 - put url at front, if you plan to use it.
                                Messes with the regexes!
                            2 - if you use hashtag then unpack_hashtags will
                                automatically be set to False

            unpack_contractions (bool): Replace *English* contractions in
                ``text`` str with their unshortened forms
                for example: can't -> can not, wouldn't -> would not, and so on...

            unpack_hashtags (bool): split a hashtag to its constituent words.
                for example: #ilikedogs -> i like dogs

            annotate (list): add special tags to special tokens.
                possible values: ['hashtag', 'allcaps', 'elongated', 'repeated']
                for example: user@example.com -> user@example.com <email>

            tokenizer (callable): callable function that accepts a string and
                returns a list of strings. When omitted, ``pre_process_doc``
                returns the whitespace-normalised string itself.

            dicts (list): dictionaries applied token by token after
                tokenization (see ``dict_replace``).

            segmenter (str): define the statistics of what corpus you would
                like to use [english, twitter]

            corrector (str): define the statistics of what corpus you would
                like to use [english, twitter]

            all_caps_tag (str): how to wrap the capitalized words
                values [single, wrap, every]
                Note: applicable only when `allcaps` is included in annotate[]
                    - single: add a tag after the last capitalized word
                    - wrap: wrap all words with opening and closing tags
                    - every: add a tag after each word

            spell_correct_elong (bool): choose if you want to perform
                spell correction after the normalization of elongated words.
                * significantly affects performance (speed)

            spell_correction (bool): choose if you want to perform
                spell correction to the text
                * significantly affects performance (speed)

            fix_text (bool): choose if you want to fix bad unicode terms and
                html entities. The legacy key ``fix_bad_unicode`` is still
                honoured when ``fix_text`` is not given.

            mode (str): defaults to "normal". With "fast" no SpellCorrector
                is created and the annotation step is skipped.
        """
        self.omit = kwargs.get("omit", {})
        self.backoff = kwargs.get("normalize", {})
        self.include_tags = kwargs.get("annotate", {})
        self.unpack_contractions = kwargs.get("unpack_contractions", False)
        self.tokenizer = kwargs.get("tokenizer", None)
        self.dicts = kwargs.get("dicts", None)
        self.spell_correction = kwargs.get("spell_correction", False)
        self.spell_correct_elong = kwargs.get("spell_correct_elong", False)

        # The documented name is `fix_text`; only `fix_bad_unicode` used to
        # be read, so the documented spelling was silently ignored.
        self.fix_text = kwargs.get("fix_text",
                                   kwargs.get("fix_bad_unicode", False))

        self.unpack_hashtags = kwargs.get("unpack_hashtags", False)
        self.segmenter_corpus = kwargs.get("segmenter", "english")
        self.corrector_corpus = kwargs.get("corrector", "english")
        self.all_caps_tag = kwargs.get("all_caps_tag", "wrap")
        self.mode = kwargs.get("mode", "normal")

        if self.unpack_hashtags:
            self.segmenter = Segmenter(corpus=self.segmenter_corpus)
        if self.mode != "fast":
            self.spell_corrector = SpellCorrector(
                corpus=self.corrector_corpus)

        self.regexes = ExManager().get_compiled()

        # A hashtag that is omitted/normalized cannot also be unpacked.
        if 'hashtag' in self.omit or 'hashtag' in self.backoff:
            print("You can't omit/backoff and unpack hashtags!\n "
                  "unpack_hashtags will be set to False")
            self.unpack_hashtags = False

    def __copy__(self):
        """Copying hands back the very same instance."""
        return self

    def __deepcopy__(self, memo):
        """Deep-copying hands back the very same instance."""
        return self

    @staticmethod
    def add_special_tag(m, tag, mode="single"):
        """Attach ``<tag>`` markup to a string or to a regex match.

        Args:
            m: a ``str`` or an object with ``.group()`` (regex match).
            tag (str): tag name, written without angle brackets.
            mode (str): "single" appends one tag after the text, "wrap"
                surrounds it with an opening and a closing tag, "every"
                appends the tag after each whitespace-separated token.

        Returns:
            The tagged text padded with spaces, or ``None`` for any other
            ``mode``.
        """
        if isinstance(m, str):
            text = m
        else:
            text = m.group()

        if mode == "single":
            return " {} <{}> ".format(text, tag)
        elif mode == "wrap":
            return " ".join([" <{}> {} </{}> ".format(tag, text, tag)]) + " "
        elif mode == "every":
            tokens = text.split()
            processed = " ".join([" {} <{}> ".format(t, tag)
                                  for t in tokens])
            return " " + processed + " "

    # NOTE: the handle_* methods used to be wrapped in lru_cache. Their
    # argument is a regex match object, which hashes by identity, so no call
    # ever hit the cache; it only kept thousands of match objects (and the
    # instance) alive. The decorators were dropped; results are unchanged.

    def handle_hashtag_match(self, m):
        """
        Break a hashtag to its constituent words.

        All-lowercase hashtags go through the segmenter, with "-" and "_"
        becoming spaces. Anything else is split with the "camel_split"
        regex, with "-" and "_" removed. The result is wrapped in hashtag
        tags when "hashtag" is among the annotations.
        """
        text = m.group()[1:]  # drop the leading '#'

        # todo:simplify routine
        if text.islower():
            expanded = self.segmenter.segment(text)
            expanded = " ".join(expanded.split("-"))
            expanded = " ".join(expanded.split("_"))
        else:
            # split words following CamelCase convention
            expanded = self.regexes["camel_split"].sub(r' \1', text)
            expanded = expanded.replace("-", "")
            expanded = expanded.replace("_", "")

        if "hashtag" in self.include_tags:
            expanded = self.add_special_tag(expanded, "hashtag", mode="wrap")

        return expanded

    def handle_elongated_match(self, m):
        """Shorten an elongated word and optionally spell-correct/tag it."""
        text = m.group()

        # normalize to at most 2 repeating chars
        text = self.regexes["normalize_elong"].sub(r'\1\1', text)

        normalized = self.spell_corrector.normalize_elongated(text)
        if normalized:
            text = normalized

        # try to spell correct the word
        if self.spell_correct_elong:
            text = self.spell_corrector.correct_word(text, assume_wrong=True,
                                                     fast=True)

        if "elongated" in self.include_tags:
            text = self.add_special_tag(text, "elongated")

        return text

    def handle_repeated_puncts(self, m):
        """
        Return the reverse-sorted set of the matched characters, so that
        random combinations of puncts are mapped to the same token:
        "!??!?!!", "?!!!!?!", "!!?", "!?!?" --> "?!"
        "!...", "...?!" --> ".!"

        :param m: regex match holding the punctuation run
        :return: the collapsed run, tagged when "repeated" is annotated
        """
        text = m.group()
        text = "".join(sorted(set(text), reverse=True))

        if "repeated" in self.include_tags:
            text = self.add_special_tag(text, "repeated")

        return text

    def handle_generic_match(self, m, tag, mode="every"):
        """
        Tag the matched text unconditionally.

        Args:
            m: regex match
            tag (str): tag name
            mode (str): tagging mode, see ``add_special_tag``

        Returns:
            the tagged text
        """
        text = m.group()
        text = self.add_special_tag(text, tag, mode=mode)

        return text

    def handle_emphasis_match(self, m):
        """
        Strip the ``*`` markers from the match and tag it when "emphasis"
        is annotated.

        :param m: regex match
        :return: the cleaned (and possibly tagged) text
        """
        text = m.group().replace("*", "")
        if "emphasis" in self.include_tags:
            text = self.add_special_tag(text, "emphasis")

        return text

    @staticmethod
    def dict_replace(wordlist, _dict):
        """Replace every item of ``wordlist`` that is a key of ``_dict``."""
        return [_dict[w] if w in _dict else w for w in wordlist]

    @staticmethod
    def remove_hashtag_allcaps(wordlist):
        """Drop allcaps tags that sit between hashtag tags."""
        in_hashtag = False
        _words = []
        for word in wordlist:

            if word == "<hashtag>":
                in_hashtag = True
            elif word == "</hashtag>":
                in_hashtag = False
            elif word in {"<allcaps>", "</allcaps>"} and in_hashtag:
                continue

            _words.append(word)

        return _words

    def handle_general_word_segment_and_spelling(self, m):
        """
        Segment the matched text with the segmenter.

        :param m: regex match
        :return: the segmented text
        """
        text = m.group()
        text = self.segmenter.segment(text)

        return text

    def pre_process_doc(self, doc):
        """Run the configured pipeline on one document.

        Steps, in order: collapse repeated spaces, optional ftfy text
        fixing, normalize/omit tokens, unpack hashtags, annotate (skipped
        in "fast" mode), unpack contractions, drop allcaps tags inside
        hashtags, tokenize, and dictionary replacement.

        Returns:
            The tokenizer's output when a tokenizer is configured, the
            whitespace-normalised string otherwise. When ``dicts`` are set
            the result is a list (built by ``dict_replace``).

        NOTE(review): with ``dicts`` set but no tokenizer, ``dict_replace``
        iterates over a string, i.e. over single characters -- confirm
        whether that combination is meant to be supported.
        """
        doc = re.sub(r' +', ' ', doc)  # remove repeating spaces

        ###########################
        # fix text
        ###########################
        if self.fix_text:
            doc = ftfy.fix_text(doc)

        ###########################
        # BACKOFF & OMIT
        ###########################
        for item in self.backoff:
            # better add an extra space after the match.
            # Just to be safe. extra spaces will be normalized later anyway
            doc = self.regexes[item].sub(
                lambda m, _tag=item: " " + "<" + _tag + ">" + " ", doc)
        for item in self.omit:
            doc = doc.replace("<" + item + ">", '')

        ###########################
        # unpack hashtags
        ###########################
        if self.unpack_hashtags:
            doc = self.regexes["hashtag"].sub(self.handle_hashtag_match, doc)

        ###########################
        # handle special cases
        ###########################
        if self.mode != "fast":
            if "allcaps" in self.include_tags:
                doc = self.regexes["allcaps"].sub(
                    lambda w: self.handle_generic_match(
                        w, "allcaps", mode=self.all_caps_tag),
                    doc)

            if "elongated" in self.include_tags:
                doc = self.regexes["elongated"].sub(
                    self.handle_elongated_match, doc)

            if "repeated" in self.include_tags:
                doc = self.regexes["repeat_puncts"].sub(
                    self.handle_repeated_puncts, doc)

            if "emphasis" in self.include_tags:
                doc = self.regexes["emphasis"].sub(
                    self.handle_emphasis_match, doc)

            if "censored" in self.include_tags:
                doc = self.regexes["censored"].sub(
                    lambda w: self.handle_generic_match(w, "censored"), doc)

        ###########################
        # unpack contractions: i'm -> i am, can't -> can not...
        ###########################
        if self.unpack_contractions:
            doc = unpack_contractions(doc)

        doc = re.sub(r' +', ' ', doc)  # remove repeating spaces

        ###########################
        # Tokenize
        ###########################
        # omit allcaps if inside hashtags
        doc = self.remove_hashtag_allcaps(doc.split())
        doc = " ".join(doc)  # normalize whitespace
        if self.tokenizer:
            doc = self.tokenizer(doc)

        # Replace tokens with special dictionaries (slang, emoticons ...)
        # todo: add spell check before!
        if self.dicts:
            for d in self.dicts:
                doc = self.dict_replace(doc, d)

        return doc

    def pre_process_docs(self, docs, lazy=True):
        """Yield ``pre_process_doc(d)`` for every ``d`` in ``docs``.

        Progress is reported through ``tqdm``. ``lazy`` is accepted for
        backward compatibility but is not used: the method is always a
        generator.
        """
        from tqdm import tqdm
        for d in tqdm(docs, desc="PreProcessing..."):
            yield self.pre_process_doc(d)
def clean_tweets(df):
    """Clean the ``tweet_text`` column of ``df`` and return ``df``.

    For every tweet, in this order:
      1. ``@mentions`` become ``<user> segmented name </user>``
         (ex: @DoctorChristian -> <user> doctor christian </user>).
      2. Runs of capitals become ``<allcaps> lowercased text </allcaps>``;
         this is done by hand so that contraction unpacking still works.
      3. Percentages become ``<percent> number </percent>``.
      4. A few contractions the tool misses are expanded.
      5. The rest is handled by ``TextPreProcessor.pre_process_doc``; the
         tokens are joined with spaces and lower-cased.

    Differences from the previous implementation (bug fixes):
      * tagging is done with ``re.sub`` callbacks, so only the matched
        occurrence is rewritten. The old ``str.replace`` calls also
        rewrote unrelated occurrences, e.g. ``@ab`` inside ``@abc``,
        ``5%`` inside ``15%``, or all-caps text inside a hashtag.
      * percentages accept any number of decimals after ``.`` or ``,``;
        the old pattern used an unescaped ``.`` and a single optional
        digit, so it split ``12.55%`` and accepted arbitrary separators.
      * the bare ``except`` blocks are gone. Only mention segmentation is
        still best-effort: a failure is logged and the mention left as is.

    Args:
        df: object exposing ``df.tweet_text.to_list()`` and item
            assignment (presumably a pandas DataFrame -- confirm with
            caller). Modified in place.

    Returns:
        ``df`` with ``tweet_text`` replaced by the cleaned strings.
    """
    import logging

    logger = logging.getLogger(__name__)

    # define the text preprocessor
    text_processor = TextPreProcessor(
        # terms that will be normalized
        normalize=['url', 'email', 'money', 'phone', 'time', 'date'],
        # terms that will be annotated
        annotate={
            "hashtag", "allcaps", "elongated", "repeated", 'emphasis',
            'censored'
        },
        fix_html=True,  # fix HTML tokens
        # corpus from which the word statistics are going to be used
        # for word segmentation
        segmenter="twitter",
        # corpus from which the word statistics are going to be used
        # for spell correction
        corrector="twitter",
        unpack_hashtags=True,  # perform word segmentation on hashtags
        unpack_contractions=True,  # Unpack contractions (can't -> can not)
        spell_correct_elong=False,  # spell correction for elongated words
        # the tokenizer should take as input a string and return a list of
        # tokens
        tokenizer=TweetTokenizer().tokenize,
        # list of dictionaries, for replacing tokens extracted from the
        # text with other expressions
        dicts=[emoticons])
    seg = Segmenter(corpus="twitter")

    mention_re = re.compile(r'@\w+')
    allcaps_re = re.compile(r"(?<![#@$])\b([A-Z][A-Z ,.']*[A-Z])\b")
    percent_re = re.compile(r"\d+(?:[.,]\d+)?%")

    # contractions that the tool misses, applied in this order
    missed_contractions = (
        (r"(\b)([Ww]hat|[Ii]t|[Hh]e|[Ss]he|[Tt]hat|[Tt]here|[Hh]ow|[Ww]ho|[Hh]ere|[Ww]here|[Ww]hen)'s",
         r"\1\2 is"),
        (r"(\b)([Aa]in)'t", r"is not"),
        (r"(\b)([Ww]asn)'t", r"was not"),
        (r"(\b)([Hh]e|[Ss]he|[Ii]|[Yy]ou|[Tt]hey|[Ww]e)'d", r"\1\2 would"),
        (r"(\b)([Ii]t|[Tt]hat|[Tt]his)'ll", r"\1\2 will"),
        (r"(\b)([Cc])'mon", r"come on"),
    )

    def tag_user(match):
        mention = match.group()
        try:
            return '<user> ' + seg.segment(mention[1:]) + ' </user>'
        except Exception:
            # best effort: an unsegmentable handle must not abort the run
            logger.warning("could not segment mention %r; left unchanged",
                           mention, exc_info=True)
            return mention

    def tag_allcaps(match):
        return '<allcaps> ' + match.group(1).lower() + ' </allcaps>'

    def tag_percent(match):
        # drop the trailing '%'
        return '<percent> ' + match.group()[:-1] + ' </percent>'

    cleaned = []
    for tweet in df.tweet_text.to_list():
        tweet = mention_re.sub(tag_user, tweet)
        tweet = allcaps_re.sub(tag_allcaps, tweet)
        tweet = percent_re.sub(tag_percent, tweet)

        for pattern, replacement in missed_contractions:
            tweet = re.sub(pattern, replacement, tweet)

        # process the rest of the tweet with the nltk tweet tokenizer
        tweet = " ".join(text_processor.pre_process_doc(tweet)).lower()
        cleaned.append(tweet)

    df['tweet_text'] = cleaned
    return df
class TextPreProcessor:
    """
    Text pre-processor that also records per-text statistics.

    Kwargs:
        normalize (list)
            possible values: ['url', 'email', 'percent', 'money', 'phone', 'user', 'time', 'date']
        annotate (list)
            possible values: ['hashtag', 'allcaps', 'elongated', 'repeated', 'emphasis', 'censored']
        unpack_hashtags (bool)
        unpack_contractions (bool)
        segmenter (str): define the statistics of what corpus you would
            like to use [english, twitter]
        corrector (str): define the statistics of what corpus you would
            like to use [english, twitter]
        tokenizer (callable): callable function that accepts a string and
            returns a list of strings
            if no tokenizer is provided then the text will be tokenized on
            single spaces
        simplify_emoticons (bool)
        dictionaries (list)

    ``self.stats`` maps the running index of each processed text to a dict
    of counters; one entry is added per ``pre_process`` call.
    """

    def __init__(self, **kwargs):
        self.tokens_to_normalize = kwargs.get("normalize", [])
        self.annotate = kwargs.get("annotate", [])
        self.unpack_hashtags = kwargs.get("unpack_hashtags", False)
        self.unpack_contractions = kwargs.get("unpack_contractions", False)
        self.segmenter_corpus = kwargs.get("segmenter", "english")
        self.corrector_corpus = kwargs.get("corrector", "english")
        self.segmenter = Segmenter(corpus=self.segmenter_corpus)
        self.spell_corrector = SpellCorrector(corpus=self.corrector_corpus)
        self.tokenizer = kwargs.get("tokenizer", None)
        self.simplify_emoticons = kwargs.get("simplify_emoticons", False)
        self.dictionaries = kwargs.get("dictionaries", [])
        self.stats = {}
        # -1 so that the first processed text gets index 0
        self.preprocessed_texts = -1

    def pre_process(self, text: str, with_stats=False):
        """Run the whole pipeline on ``text``.

        Returns:
            The list of tokens, or ``(tokens, stats)`` when ``with_stats``
            is true, where ``stats`` is the counter dict of this text.
        """
        self._increment_counter()
        text = self._remove_repeating_spaces(text)
        text = self._normalize(text)
        text = self._unpack_hashtags(text)
        text = self._annotate(text)
        text = self._unpack_contractions(text)
        text = self._remove_repeating_spaces(text)
        tokens = self._tokenize(text)
        tokens = self._simplify_emoticons(tokens)
        tokens = self._replace_using_dictionaries(tokens)
        if with_stats:
            return tokens, self._pre_processed_text_stats()
        else:
            return tokens

    def _pre_processed_text_stats(self):
        """Counters of the text currently being processed."""
        return self.stats[self.preprocessed_texts]

    def _increment_counter(self):
        """Start a fresh counter dict for the next text."""
        self.preprocessed_texts += 1
        self.stats[self.preprocessed_texts] = {}

    def _normalize(self, text):
        """Replace every configured token kind by its ``<kind>`` marker."""
        for item in self.tokens_to_normalize:
            text = self._change_using_regexp(item, lambda m: f' <{item}> ', text, 'normalize')
        return text

    def _unpack_hashtags(self, text):
        if self.unpack_hashtags:
            return self._change_using_regexp("hashtag", lambda w: self._handle_hashtag_match(w), text, "unpack")
        return text

    def _annotate(self, text):
        """Apply every annotation step; each one is a no-op unless enabled."""
        text = self._annotate_allcaps(text)
        text = self._annotate_elongated(text)
        text = self._annotate_repeated(text)
        text = self._annotate_emphasis(text)
        text = self._annotate_censored(text)
        return text

    def _annotate_allcaps(self, text):
        if "allcaps" in self.annotate:
            return self._change_using_regexp("allcaps", lambda w: self._handle_generic_match(w, "allcaps", mode='wrap'),
                                             text, "annotate")
        return text

    def _annotate_elongated(self, text):
        if "elongated" in self.annotate:
            return self._change_using_regexp("elongated", lambda w: self._handle_elongated_match(w), text, "annotate")
        return text

    def _annotate_repeated(self, text):
        if "repeated" in self.annotate:
            return self._change_using_regexp("repeat_puncts", lambda w: self._handle_repeated_puncts(w), text,
                                             "annotate")
        return text

    def _annotate_emphasis(self, text):
        if "emphasis" in self.annotate:
            return self._change_using_regexp("emphasis", lambda w: self._handle_emphasis_match(w), text, "annotate")
        return text

    def _annotate_censored(self, text):
        if "censored" in self.annotate:
            return self._change_using_regexp("censored", lambda w: self._handle_generic_match(w, "censored"), text,
                                             "annotate")
        return text

    def _change_using_regexp(self, regexp_name, func, text, stats_name_prefix):
        """Substitute with ``regexes[regexp_name]`` and count the hits.

        The number of substitutions is added to the statistic named
        ``<stats_name_prefix>_<regexp_name>``.
        """
        changing_result = regexes[regexp_name].subn(func, text)
        self._update_stats(f'{stats_name_prefix}_{regexp_name}', changing_result[1])
        return changing_result[0]

    def _unpack_contractions(self, text):
        """Expand English contractions when the option is enabled."""
        if self.unpack_contractions:
            text = self._unpack_selected_contrations(r"(\b)([Aa]re|[Cc]ould|[Dd]id|[Dd]oes|[Dd]o|[Hh]ad|[Hh]as|[Hh]ave|"
                                                     r"[Ii]s|[Mm]ight|[Mm]ust|[Ss]hould|[Ww]ere|[Ww]ould)n'?t",
                                                     r"\1\2 not", text)
            text = self._unpack_selected_contrations(r"(\b)([Hh]e|[Ii]|[Ss]he|[Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Yy]ou)'ll",
                                                     r"\1\2 will", text)
            text = self._unpack_selected_contrations(r"(\b)([Tt]hey|[Ww]hat|[Ww]ho|[Yy]ou)ll", r"\1\2 will", text)
            text = self._unpack_selected_contrations(r"(\b)([Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Yy]ou)'re", r"\1\2 are",
                                                     text)
            text = self._unpack_selected_contrations(r"(\b)([Tt]hey|[Ww]hat|[Yy]ou)re", r"\1\2 are", text)
            # The pattern used to start with "[[Hh]e", a stray bracket that
            # put "[" into the character class and expanded "[e's" as well.
            text = self._unpack_selected_contrations(r"(\b)([Hh]e|[Ss]he)'s", r"\1\2 is", text)
            text = self._unpack_selected_contrations(
                r"(\b)([Ii]|[Ss]hould|[Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Ww]ould|[Yy]ou)"
                r"'?ve", r"\1\2 have", text)
            text = self._unpack_selected_contrations(r"(\b)([Cc]a)n't", r"\1\2n not", text)
            text = self._unpack_selected_contrations(r"(\b)([Ii])'m", r"\1\2 am", text)
            text = self._unpack_selected_contrations(r"(\b)([Ll]et)'?s", r"\1\2 us", text)
            text = self._unpack_selected_contrations(r"(\b)([Ww])on'?t", r"\1\2ill not", text)
            text = self._unpack_selected_contrations(r"(\b)([Ss])han'?t", r"\1\2hall not", text)
            text = self._unpack_selected_contrations(r"(\b)([Yy])(?:'all|a'll)", r"\1\2ou all", text)
        return text

    def _unpack_selected_contrations(self, regexp, replacement, text):
        """Apply one contraction rule and add its hit count to the stats."""
        unpacking_result = re.subn(regexp, replacement, text)
        self._update_stats("unpack_contrations", unpacking_result[1])
        return unpacking_result[0]

    def _tokenize(self, text):
        if self.tokenizer:
            return self.tokenizer(text)
        else:
            return text.split(' ')

    def _simplify_emoticons(self, tokens):
        """Map tokens found in ``emoticons`` to their simplified form."""
        if self.simplify_emoticons:
            result = []
            for token in tokens:
                if token in emoticons:
                    new_emoticon = emoticons[token]
                    if new_emoticon != token:
                        self._update_stats('emoticon_simplification', 1)
                    result.append(new_emoticon)
                else:
                    result.append(token)
            return result
        else:
            return tokens

    def _replace_using_dictionaries(self, tokens):
        """Replace tokens by their dictionary value (in place).

        Values containing ``<entity>`` are skipped. The result is re-split
        on single spaces so that multi-word values become several tokens.
        """
        if len(self.dictionaries) > 0:
            for dictionary in self.dictionaries:
                for idx, token in enumerate(tokens):
                    if token in dictionary:
                        value = dictionary[token]
                        if '<entity>' not in value:
                            tokens[idx] = value
                            self._update_stats('dictionary_replacement', 1)
            return ' '.join(tokens).split(' ')
        else:
            return tokens

    # NOTE: the _handle_* methods below used to be wrapped in lru_cache.
    # Their argument is a regex match object, which hashes by identity, so
    # no call ever hit the cache; it only kept up to 65536 match objects
    # (and the instance) alive. The decorators were dropped; results are
    # unchanged.

    def _handle_hashtag_match(self, m):
        """Expand a hashtag match into words, optionally wrapped in tags."""
        text = m.group()[1:]  # drop the leading '#'
        if text.islower():
            expanded = self.segmenter.segment(text)
            expanded = " ".join(expanded.split("-"))
            expanded = " ".join(expanded.split("_"))
        else:
            expanded = regexes["camel_split"].sub(r' \1', text)
            expanded = expanded.replace("-", "")
            expanded = expanded.replace("_", "")
        if "hashtag" in self.annotate:
            expanded = self._add_special_tag(expanded, "hashtag", mode="wrap")
        return expanded

    def _handle_generic_match(self, m, tag, mode="every"):
        """Tag the matched text; all-caps matches are lower-cased first."""
        text = m.group()
        if tag == 'allcaps':  # work around for allcaps contractions like YOU'RE TODO refactor
            text = text.lower()
        text = self._add_special_tag(text, tag, mode=mode)
        return text

    def _handle_elongated_match(self, m):
        """Shorten an elongated word and tag it."""
        text = m.group()
        text = regexes["normalize_elong"].sub(r'\1\1', text)
        normalized = self.spell_corrector.normalize_elongated(text)
        if normalized:
            text = normalized
        text = self._add_special_tag(text, "elongated")
        return text

    def _handle_repeated_puncts(self, m):
        """Collapse a punctuation run to its reverse-sorted set and tag it."""
        text = m.group()
        text = "".join(sorted(set(text), reverse=True))
        text = self._add_special_tag(text, "repeated")
        return text

    def _handle_emphasis_match(self, m):
        """Strip ``*`` markers from the match and tag it."""
        text = m.group().replace("*", "")
        text = self._add_special_tag(text, "emphasis")
        return text

    def _update_stats(self, key, value):
        """Add ``value`` to counter ``key`` of the current text (if > 0)."""
        if value > 0:
            stats_for_text = self.stats[self.preprocessed_texts]
            if key not in stats_for_text:
                stats_for_text[key] = 0
            stats_for_text[key] += value

    @staticmethod
    def _remove_repeating_spaces(text):
        return re.sub(r' +', ' ', text).strip()

    @staticmethod
    def _add_special_tag(m, tag, mode="single"):
        """Attach ``<tag>`` markup to a string or to a regex match.

        "single" appends one tag, "wrap" surrounds the text with opening
        and closing tags, "every" appends the tag after each token. Any
        other mode yields ``None``.
        """
        if isinstance(m, str):
            text = m
        else:
            text = m.group()

        if mode == "single":
            return " {} <{}> ".format(text, tag)
        elif mode == "wrap":
            return " ".join([" <{}> {} </{}> ".format(tag, text, tag)]) + " "
        elif mode == "every":
            tokens = text.split()
            processed = " ".join([" {} <{}> ".format(t, tag) for t in tokens])
            return " " + processed + " "
from ekphrasis.classes.segmenter import Segmenter

# segmenter using the word statistics from english Wikipedia
seg_eng = Segmenter(corpus="english")

# segmenter using the word statistics from Twitter
seg_tw = Segmenter(corpus="twitter")

# segmenter using the word statistics of the corpus named "twitter_2018"
seg_tw_2018 = Segmenter(corpus="twitter_2018")

words = [
    "exponentialbackoff",
    "gamedev",
    "retrogaming",
    "thewatercooler",
    "panpsychism",
]

# Print each word followed by its segmentation under every corpus.
for w in words:
    print(w)
    print("(eng):", seg_eng.segment(w))
    print("(tw):", seg_tw.segment(w))
    # This line used to be labelled "(tw):" too, which made the two Twitter
    # results indistinguishable in the output.
    print("(tw_2018):", seg_tw_2018.segment(w))
    print()
def tokenize_hashtags(hashtags):
    """Segment every hashtag and join the results with single spaces.

    Args:
        hashtags: iterable of hashtag strings.

    Returns:
        One string holding the segmented form of each hashtag, in input
        order, separated by spaces (empty string for an empty input).
    """
    seg_eng = Segmenter(corpus="english")
    # Segment each element `h`. The previous code passed the whole
    # `hashtags` collection to `segment` on every iteration.
    return ' '.join(seg_eng.segment(h) for h in hashtags)
def preprocess_corpus(corpus,stemming=False,
                      all_smilies=False, pos_smilies=False, neg_smilies=False, other_smilies=False,
                      hugs_and_kisses=False,hearts=False,
                      hashtag=False, hashtag_mention=False,
                      numbers=False, number_mention=False,
                      exclamation=False,  # NOTE: this option is currently untested; possibly just remove it
                      set_to_not=False,
                      segmentation_hash= False,
                      spelling=False,
                      elongation=False,
                      remove_signs=False
                      ):
    """
    Apply token-level preprocessing to every tweet in a corpus.

    Each tweet is tokenized with TweetTokenizer(reduce_len=True). Every token
    is then offered to the enabled rules in the order listed below; the first
    rule that handles a token marks it as treated and later rules skip it.
    Emoticon rules look at the current token together with its neighbours and
    fire on the LAST token of the emoticon, so the earlier token(s) of the
    emoticon have already been processed on their own (they are only dropped
    if remove_signs removes them).

    Input:
        corpus: iterable of tweets (strings); the original note says it has
            the format produced by creat_corpus -- not visible here.
        stemming: if true, tokens that reach the final fall-through step are
            stemmed with PorterStemmer. Default False.
        all_smilies: if true, same effect as if pos_smilies, neg_smilies and
            other_smilies were all true. Default False.
        pos_smilies: if true, positive smilies (e.g. ':d', ';p', ': )', '( :',
            'x d') add the token 'smile'. Default False.
        neg_smilies: if true, negative smilies (': (', ') :') add the token
            'sad'. Default False.
        other_smilies: if true, '^_^' adds 'eyesmiley', ': o' adds
            'openmouthface', ': /' adds 'slashsmiely' and ': *' / '; *' add
            'kiss'. Default False.
        hugs_and_kisses: if true, 'xoxo', 'xo', 'xoxoxo', 'xxoo' are replaced
            by 'hug' and 'kiss'; 'xx', 'xxx', 'xxxx' by 'kiss'. Default False.
        hearts: if true, '<3' is replaced by 'heart'. Default False.
        hashtag: if true, the leading '#' is removed from tokens, so #apple
            becomes apple. Default False.
        hashtag_mention: if true, and if hashtag is true, the word 'hashtag'
            is appended to a tweet that contained at least one token starting
            with '#'. Default False.
        numbers: if true, tokens consisting only of digits are removed.
            Default False.
        number_mention: if true, and if numbers is true, the word 'number' is
            appended to a tweet that contained at least one digit-only token.
            Default False.
        exclamation: if true, '!' tokens are kept and the word 'exclamation'
            is appended to a tweet that contains at least one '!'.
            Default False.
        set_to_not: if true, every token ending in "n't" is replaced by 'not'.
            Default False.
        segmentation_hash: if true (and hashtag is true), a hashtag whose text
            is not accepted by the en_US dictionary is split into segments,
            e.g. '#iammoving' becomes 'i am moving'. Default False.
        spelling: if true, tokens longer than two characters that are not
            accepted by the en_US dictionary are replaced by the spell
            corrector's suggestion. Default False.
        elongation: if true, for tokens longer than two characters that do not
            start with a digit and are not accepted by the en_US dictionary,
            every run of a repeated letter is shortened to at most two. When a
            token is changed this way the word 'elongation' is inserted before
            it. Default False.
        remove_signs: if true, the tokens '^', ',', '.', ':', '-', '´', ';',
            ')', '(' and '*' are removed. Default False.

    Output:
        new_corpus: list with one entry per input tweet: the processed tokens
            joined by single spaces and encoded with .encode('utf-8').
            NOTE(review): under Python 3 these entries are bytes, not str --
            confirm that callers expect that.
    """

    start = time.time()

    # Initialise the new corpus:
    new_corpus=[]

    # Tokenizer used to split the tweets:
    tknzr = TweetTokenizer(reduce_len=True)

    # Optional helpers are only created when the matching option is enabled.
    if stemming:
        ps = PorterStemmer()
    if segmentation_hash or spelling or elongation:
        d = enchant.Dict("en_US")
    if segmentation_hash:
        # Alternative corpus: Segmenter(corpus="english")
        seg = Segmenter(corpus="twitter")
    if spelling:
        sp = SpellCorrector(corpus="english")

    elapsed = time.time()
    print("Time in min before starting first for loop:", (elapsed - start) / 60 )

    # Go through each line (tweet) in the corpus
    for k, line in enumerate(corpus):

        # Per-tweet flags; each one is only read when its option is enabled.
        if hashtag_mention:
            there_is_hashtag=False
        if number_mention:
            there_is_number=False
        if exclamation:
            there_is_exclamation=False

        # Split the tweet using the chosen tokenizer.
        words=tknzr.tokenize(line)
        # Tokens of the cleaned tweet:
        cleaned_tweet=[]

        for i, word in enumerate(words):
            # The token has not been handled by any rule yet
            word_not_treated=True
            end_=len(words)-1

            # Positive smilies -> 'smile'
            if ((pos_smilies or all_smilies) and word_not_treated):
                if (i>0 and (word=='d' and (words[i-1]==':' or words[i-1]==';'))) or word == ':d' or word == ';d':
                    cleaned_tweet.append('smile')
                    word_not_treated=False
                elif (i>0 and (word=='p' and (words[i-1]==':' or words[i-1]==';'))) or word == ':p' or word == ';p' :
                    cleaned_tweet.append('smile')
                    word_not_treated=False
                # The ':' / ';' cases are already caught above; this branch only adds 'x d'.
                elif i>0 and word=='d' and (words[i-1]==':' or words[i-1]==';' or words[i-1]=='x'):
                    cleaned_tweet.append('smile')
                    word_not_treated=False
                elif i>0 and words[i-1]=='(' and (word==':' or word==';'):
                    cleaned_tweet.append('smile')
                    word_not_treated=False
                elif i>0 and word==')' and (words[i-1]==':' or words[i-1]==';'):
                    cleaned_tweet.append('smile')
                    word_not_treated=False

            # Negative smilies -> 'sad'
            if ((neg_smilies or all_smilies) and word_not_treated):
                if i>0 and words[i-1]==')' and (word==':' or word==';'):
                    cleaned_tweet.append('sad')
                    word_not_treated=False
                elif i>0 and word=='(' and (words[i-1]==':' or words[i-1]==';'):
                    cleaned_tweet.append('sad')
                    word_not_treated=False

            # Other smilies -> a describing word
            if ((other_smilies or all_smilies) and word_not_treated):
                if i>0 and i<end_ and word=='_' and words[i-1]=='^' and words[i+1]=='^':
                    cleaned_tweet.append('eyesmiley')
                    word_not_treated=False
                elif i>0 and word=='o' and words[i-1]==':':
                    cleaned_tweet.append('openmouthface')
                    word_not_treated=False
                elif i>0 and word=='/' and words[i-1]==':':
                    cleaned_tweet.append('slashsmiely')
                    word_not_treated=False
                elif i>0 and word=='*' and (words[i-1]==':' or words[i-1]==';'):
                    cleaned_tweet.append('kiss')
                    word_not_treated=False

            if ((hugs_and_kisses and word_not_treated)):
                # Look for hugs and kisses:
                if (word == "xoxo" or word == "xo" or word == "xoxoxo" or word == "xxoo"):
                    cleaned_tweet.append('hug')
                    cleaned_tweet.append('kiss')
                    word_not_treated=False
                elif (word=='xx' or word=='xxx'or word=='xxxx'):
                    cleaned_tweet.append('kiss')
                    word_not_treated=False

            if ((hearts and word_not_treated)):
                if word == "<3":
                    cleaned_tweet.append('heart')
                    word_not_treated=False

            if (hashtag and word_not_treated):
                if word[0]=='#':
                    there_is_hashtag=True
                    # Segment only hashtags whose text the dictionary rejects.
                    if (len(word)>1 and segmentation_hash and not d.check(word[1:])):
                        cleaned_tweet.append(seg.segment(word[1:]))
                    else:
                        cleaned_tweet.append(word[1:])
                    word_not_treated=False

            if (numbers and word_not_treated):
                # Digit-only tokens are dropped (nothing is appended).
                if word.isdigit():
                    there_is_number=True
                    word_not_treated=False

            if (exclamation and word_not_treated):
                if word=='!':
                    there_is_exclamation=True
                    cleaned_tweet.append(word)
                    word_not_treated=False

            if (set_to_not and word_not_treated):
                if word[-3:]=='n\'t':
                    cleaned_tweet.append('not')
                    word_not_treated=False

            # Fall-through: tokens that no rule above handled.
            if (word_not_treated):
                if (not remove_signs) or (remove_signs and ( (word!= '^' and word!=',' and word!='.' and word!=':'
                                                              and word!='-' and word!='´' and word!=';'and word!=')'
                                                              and word!='(' and word!='*'))):

                    if ((not word[0].isdigit()) and elongation and not d.check(word) and len(word)>2):
                        # Keep the first and last letter; drop every interior
                        # letter that equals both of its neighbours.
                        new=[]
                        new.append(word[0])
                        # NOTE: this loop reuses the name `i` of the outer token loop;
                        # the outer index is not read again in this iteration.
                        for i,letter in enumerate(word):
                            if i>0 and i<len(word)-1:
                                if not( letter==word[i-1]==word[i+1]):
                                    new.append(letter)
                        new.append(word[-1])
                        new_word=''.join(new)
                        if new_word!= word:
                            cleaned_tweet.append('elongation')
                            word=new_word

                    if spelling and not d.check(word)and len(word)>2:
                        word=sp.correct(word)

                    if stemming:
                        word=ps.stem(word)

                    cleaned_tweet.append(word)

        # Marker words appended once per tweet.
        if (hashtag_mention and there_is_hashtag) :
            cleaned_tweet.append('hashtag')
        if (number_mention and there_is_number) :
            cleaned_tweet.append('number')
        if (exclamation and there_is_exclamation):
            cleaned_tweet.append('exclamation')

        new_words = ' '.join(cleaned_tweet)
        new_words = new_words.encode('utf-8')
        new_corpus.append(new_words)

        # Progress report (fires at k = 1, 25001, 50001, ...).
        if np.mod(k,25000)==1:
            elapsed = time.time()
            print("Time in min after", k, " tweets:", (elapsed - start) / 60 )

    elapsed = time.time()
    print("Time in min total:", (elapsed - start) / 60 )
    return new_corpus
# ekphrasis is a sentiment-analysis toolkit; here it is used specifically for
# hashtag segmentation.


def clean_tweet(tweet):
    """Clean a tweet: remove picture links, mentions, URLs and special characters.

    Args:
        tweet: raw tweet text.

    Returns:
        The cleaned text with words separated by single spaces.
    """
    # Drop "pic.<something>" links. The dot is escaped so that ordinary words
    # starting with "pic" (e.g. "picture") are not deleted.
    tweet = re.sub(r"pic\.\S+", "", tweet)
    # Replace @mentions, any character other than letters/digits/space/tab,
    # and scheme://... URLs with a space, then collapse whitespace.
    return ' '.join(
        re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet).split())


# Query the 20 most recent tweets
tweets = query_tweets_from_user("realDonaldTrump", 20)

# Must exist before the loop below fills it; creating it afterwards would
# raise NameError on first use and then discard everything collected.
hashtagArray = []

# Print the cleaned tweets and collect their hashtags
for tweet in tweets:
    print(clean_tweet(tweet.text))
    tweetHashtag = re.findall(r"#(\w+)", tweet.text)
    if tweetHashtag:
        hashtagArray.extend(tweetHashtag)
    # NOTE(review): assumed to separate individual tweets in the output;
    # the original layout does not show whether this sat inside the loop.
    print("\n")

# The corpus selects the word statistics used to segment the hashtags;
# here they come from Twitter.
seg_tw = Segmenter(corpus="twitter")

print("Hashtags Segmention:\n")
for hashtag in hashtagArray:
    print("(tw):", seg_tw.segment(hashtag))
'url', 'email', 'percent', 'money', 'phone', 'user', 'time', 'url', 'date', 'number' ], annotate={ "hashtag", "allcaps", "elongated", "repeated", 'emphasis', 'censored' }, fix_html=True, # fix HTML tokens segmenter="twitter", corrector="twitter", unpack_hashtags=True, # perform word segmentation on hashtags unpack_contractions=True, # Unpack contractions (can't -> can not) spell_correct_elong=True, # spell correction for elongated words tokenizer=SocialTokenizer(lowercase=True).tokenize, dicts=[emoticons]) seg_tw = Segmenter(corpus="twitter") sp = SpellCorrector(corpus="twitter") f1 = open('tokenized_tweets_golbeck.txt', 'w') c = 1 for line in data: a = line.strip().split('\t') if len(a) >= 3: b = a[2] c = a[1] b = b.split() for i in range(len(b)): if b[i].startswith('http'): b[i] = '<url>' b = ' '.join(b) a = text_processor.pre_process_doc(b) for i in range(len(a)):
# Module-level set-up. Statement order matters: warnings are silenced and
# stdout is redirected before the NLP resources are loaded.
warnings.filterwarnings("ignore")
# From here on everything printed goes to this file; the handle is left open
# on purpose for the lifetime of the process.
sys.stdout = open("./output/disaster_output.txt", "w")
plt.style.use('ggplot')
nlp = spacy.load('en_core_web_sm')
deselect_stop_words = ['no', 'not']  # we don't consider no and not stop words
for w in deselect_stop_words:
    nlp.vocab[w].is_stop = False
lemmatizer = WordNetLemmatizer()
stop_words = safe_get_stop_words('en')
# '#' followed by a run of word characters, '-' or '_'
hashtag_regex = re.compile(r"\#\b[\w\-\_]+\b")
twitter_segmenter = Segmenter(corpus="twitter_2018")
# Split points inside identifiers: case changes, digit runs and '-' / '_'
camelcase_regex = re.compile(
    r'((?<=[a-z])[A-Z]|(?<!^)[A-Z](?=[a-z])|[0-9]+|(?<=[0-9\-\_])[A-Za-z]|[\-\_])'
)


# DATA PRE-PROCESSING FUNCTIONS
def unescape_tweet(tweet):
    """Unescape HTML character references found in the text.

    Args:
        tweet: text possibly containing references such as ``&amp;``.

    Returns:
        The text with the references converted to their characters.
    """
    return html.unescape(tweet)


def strip_html_tags(text):
    """Remove HTML tags from text.

    Args:
        text: HTML or plain text.

    Returns:
        The text content of the parsed document, with the pieces of text
        joined by single spaces.
    """
    soup = BeautifulSoup(text, 'lxml')
    stripped_text = soup.get_text(separator=" ")
    # The visible code computed this value but never returned it, so the
    # function always produced None.
    return stripped_text
Dependency: Preinstalled Dataset for ekphrasis """ import sys import re import numpy as np from enum import Enum from sklearn import metrics import tensorflow as tf from tensorflow.contrib import rnn from ekphrasis.classes.segmenter import Segmenter import warnings warnings.simplefilter("ignore") # Twitter Hashtag Parser tw = Segmenter(corpus="twitter") # Configuration class for training model. class Configuration: num_epochs = 500 size_batch = 256 max_time_steps = 40 LSTM_CT = 4 LSTM_SZ = 200 ratio_dropout = 0.95 embedding_size = 100 rate_learning = 0.01 class PredictionPhase(Enum):
corrector="twitter", unpack_hashtags=True, # perform word segmentation on hashtags # unpack_users=True, # dunno if this is a thing unpack_contractions=True, # Unpack contractions (can't -> can not) spell_correct_elong=False, # spell correction for elongated words # select a tokenizer. You can use SocialTokenizer, or pass your own # the tokenizer, should take as input a string and return a list of tokens #tokenizer=SocialTokenizer(lowercase=True).tokenize, tokenizer=TweetTokenizer().tokenize, # list of dictionaries, for replacing tokens extracted from the text, # with other expressions. You can pass more than one dictionaries. dicts=[emoticons]) seg = Segmenter(corpus="twitter") clean_tweets = [] for tweet in data: # manually tag usernames # ex: @DoctorChristian -> <user> doctor christian </user> match = re.findall(r'@\w+', tweet) try: for at in match: user_seg = seg.segment(at[1:]) tweet = tweet.replace(at, '<user> ' + user_seg + ' </user>') except: None
# for spell correction corrector="twitter", unpack_hashtags=True, # perform word segmentation on hashtags unpack_contractions=True, # Unpack contractions (can't -> can not) spell_correct_elong=True, # spell correction for elongated words spell_correction=True, # select a tokenizer. You can use SocialTokenizer, or pass your own # the tokenizer, should take as input a string and return a list of tokens tokenizer=SocialTokenizer(lowercase=True).tokenize, # list of dictionaries, for replacing tokens extracted from the text, # with other expressions. You can pass more than one dictionaries. dicts=[emoticons, slangdict]) segmenter = Segmenter(corpus="twitter") count = 0 all_texts = [] user_dict = defaultdict(lambda: None) for file_name in sorted(os.listdir(tweet_path)): if file_name.endswith('.json'): print('processing ' + file_name) with open(tweet_path + file_name, 'r') as tweet_batch: tweets = json.load(tweet_batch) for tweet in tweets: # text = preprocess(tweet['content']['text']) tokens = text_processor.pre_process_doc(text) tokens = [segmenter.segment(t) for t in tokens] text = " ".join(tokens) text = process_tags(text).strip()