def find_npnp_patterns(self, wordpair, doc):
    # TODO Make an easier switch from head and full NP?
    # check each sentence and find both words in word pair.
    # if they are both in the sentence, then create a pattern.
    # Creating pattern: if adjacent, <X><Y> or <Y><X>, if not then <X>blahblah<Y> where X and Y are NPs.
    # X and Y are NPs but we want to extract the heads of X and Y.
    # X = anaphor, Y = antecedent
    ret = []
    for s in doc.sentences:
        tokens = [tok.token for tok in s.words]
        if wordpair.anaphor.token in tokens and wordpair.antecedent.token in tokens:
            pattern_str1 = '(' + wordpair.anaphor.token + ')(.*)(' + wordpair.antecedent.token + ')'
            pattern_str2 = '(' + wordpair.antecedent.token + ')(.*)(' + wordpair.anaphor.token + ')'
            pattern1 = re.compile(pattern_str1)
            pattern2 = re.compile(pattern_str2)
            sent_str = ' '.join([w.token for w in s.words])
            match1 = re.search(pattern1, sent_str)
            match2 = re.search(pattern2, sent_str)
            if match1:
                print(match1.group(1, 2, 3))
                ret.append(NPNP('<X>(' + match1.group(2) + ')<Y>'))
            if match2:
                print(match2.group(1, 2, 3))
                ret.append(NPNP('<Y>(' + match2.group(2) + ')<X>'))
    return ret
def get_emojis_pattern():
    try:
        # UCS-4
        emojis_pattern = re.compile(
            u'([\U00002600-\U000027BF])|([\U0001f300-\U0001f64F])|([\U0001f680-\U0001f6FF])')
    except re.error:
        # UCS-2
        emojis_pattern = re.compile(
            u'([\u2600-\u27BF])|([\uD83C][\uDF00-\uDFFF])|([\uD83D][\uDC00-\uDE4F])|([\uD83D][\uDE80-\uDEFF])')
    return emojis_pattern
def get_emojis_pattern():
    # emojis_pattern = re.compile(u'[' u'\U0001F300-\U0001F64F' u'\U0001F680-\U0001F6FF' u'\u2600-\u26FF\u2700-\u27BF]+', re.UNICODE)
    emojis_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # box drawing, geometric shapes & misc symbols
        u"\U00002702-\U000027B0"  # dingbats
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # variation selector
        u"\u3030"
        "]+",
        re.UNICODE)
    return emojis_pattern
def tokenize(self):
    filename = r'../static/data/news_threeDays_crawling.csv'
    with open(filename, 'r', encoding='utf-8') as f:
        self.texts = f.read()
    texts = self.texts.replace('\n', '')
    tokenizer = re.compile(r'[^ㄱ-힣]')  # keep Hangul characters only
    self.texts = tokenizer.sub(' ', texts)
    self.tokens = word_tokenize(self.texts)
    _arr = []
    for token in self.tokens:
        token_pos = self.okt.pos(token)
        _ = [txt_tag[0] for txt_tag in token_pos if txt_tag[1] == 'Noun']
        if len("".join(_)) > 1:
            _arr.append("".join(_))
    self.noun_tokens = " ".join(_arr)
    filename = r'../static/data/stopwords.txt'
    with open(filename, 'r', encoding='utf-8') as f:
        self.stopword = f.read().split()  # split the stopword file into a list of words
    self.noun_tokens = word_tokenize(self.noun_tokens)
    self.noun_tokens = [text for text in self.noun_tokens if text not in self.stopword]
    keyword_list = self.noun_tokens
    self.freqtxt = pd.Series(dict(FreqDist(keyword_list))).sort_values(ascending=False)
    c2 = collections.Counter(keyword_list)
    a = c2.most_common(50)
    with open('../static/data/news_threeDays_mining.csv', 'w', encoding='utf-8', newline='') as file:
        csvfile = csv.writer(file)
        for row in a:
            csvfile.writerow(row)
    return file
def text_cleaning(text):
    # define stopwords list
    stop = stopwords.words('english') + [
        "would", "could", "also", "one", "ha", "can't", "it's", "i've",
        "u", "it", "us", "we", "t", "s"
    ]
    # cleaning
    if pd.isnull(text):
        return ""
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text)  # remove URLs
    text = text.lower()  # to lowercase
    text = ''.join([i for i in text if not i.isdigit()])  # remove digits
    # text = text.replace('[^\w\s#@/:%.,_-]', '', flags=re.UNICODE)  # remove unicodes and emojis
    text = re.sub(r'(.)\1+', r'\1\1', text)  # collapse repeated characters to at most two
    unis_emojis_pattern = re.compile(
        pattern="["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "]+",
        flags=re.UNICODE)
    text = unis_emojis_pattern.sub(r' ', text)
    text = re.sub(r'[^\w\s]', ' ', text)  # remove punctuation
    text = ' '.join(x for x in text.split() if x not in stop)  # remove stopwords
    return text
def parseText(text):
    words = word_tokenize(text, language=QuickViewExtractor.LANGUAGE)
    regex = re.compile('[^a-zA-Z0-9]')
    words = [regex.sub('', w).lower() for w in words]
    words = [w for w in words if w]
    return words
def get_negations_pattern():
    negations_ = {"isn't": "is not", "can't": "can not", "couldn't": "could not",
                  "hasn't": "has not", "hadn't": "had not", "won't": "will not",
                  "wouldn't": "would not", "aren't": "are not", "haven't": "have not",
                  "doesn't": "does not", "didn't": "did not", "don't": "do not",
                  "shouldn't": "should not", "wasn't": "was not", "weren't": "were not",
                  "mightn't": "might not", "mustn't": "must not"}
    return re.compile(r'\b(' + '|'.join(negations_.keys()) + r')\b')
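# The function above compiles a pattern over the contraction keys but discards the expansion
# values. A minimal usage sketch (the expand_negations helper and sample text are hypothetical,
# not part of the source) pairs the same dictionary with the pattern to expand the contractions:
import re

def expand_negations(text):
    negations_ = {"isn't": "is not", "can't": "can not", "don't": "do not"}  # abbreviated mapping
    pattern = re.compile(r'\b(' + '|'.join(negations_.keys()) + r')\b')
    # replace each matched contraction with its expansion from the dict
    return pattern.sub(lambda m: negations_[m.group(1)], text)

print(expand_negations("i can't believe it, don't go"))  # -> "i can not believe it, do not go"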
def parse_tweets(self, tweets):
    regex = re.compile(
        '[,#@`:)(\[\]\'%^~=&*+/;<>{}|!?._]|http[,#@`\-:)(\[\]\'%^=&_*+/;<>{}|.!?a-z]*'
    )
    named_entities_tree = ''
    for tweet in tweets:
        text = str.lower(str(tweet.processed_text))
        text = regex.sub('', text)
        current_tree = self.parse(pos_tag(word_tokenize(text)))
        named_entities_tree += str(current_tree)
    return named_entities_tree
def get_emojis_pattern():
    return re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        "]+",
        flags=re.UNICODE)
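# Minimal usage sketch, assuming the get_emojis_pattern() above is in scope; the sample text is made up.
emoji_re = get_emojis_pattern()
print(emoji_re.sub('', u"good trip \U0001F680\U0001F600!"))  # emojis stripped -> "good trip !"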
def preprocess_text(old_text):
    """preprocess given text and return str"""
    new_text = re.sub(r'https?:\/\/.*[\r\n]*', '', old_text)  # remove URLs first
    new_text = re.sub(r'<[^>]+>', '', new_text)  # remove html (line breaks etc.)
    new_text = re.sub(re.compile(r"(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)"), '', new_text)  # remove email
    new_text = re.sub(r'#', '', new_text)  # remove hash sign from hashtags, hashtag itself remains
    new_text = re.sub(r'@[^\s]+', '', new_text)  # delete @-mentions TODO I think it has no influence
    new_text = re.sub("[^a-zA-Z]", " ", new_text)  # remove remaining special characters
    words = new_text.lower().split()  # lowercase, split into words
    # words = [word for word in words if not word in stopwords_english]  # remove stop words -> makes results worse
    # words = [stemmer.stem(word) for word in words]  # stemming -> leads to worse results
    # words = [lemma.lemmatize(word) for word in words]  # lemmatization -> leads to worse results
    # join words list back to one tweet
    return " ".join(words)
def text_tokenizer(self, text: str):
    text = re.sub(r'\S*@\S*\s?', '', text)  # remove emails
    text = re.sub(r'^https?://.*[\r\n]*', '', text, flags=re.MULTILINE)  # remove websites
    words = word_tokenize(text, 'english')
    words = list(filter(lambda word: len(word) >= self.min_length, words))
    # text = (list(map(lambda x: self.stemmer.stem(x), words)))
    tokens = list(map(lambda x: self.lemmatizer.lemmatize(x), words))
    p = re.compile('[a-zA-Z]+')
    filtered_tokens = list(
        filter(
            lambda token: p.match(token) and len(token) >= self.min_length,
            tokens))
    return filtered_tokens
def preprocess_tweet(text):
    new_tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', text)  # remove URL
    new_tweet = re.sub(r'<[^>]+>', '', new_tweet)  # remove html (line breaks etc.)
    new_tweet = re.sub(re.compile(r"(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)"), '', new_tweet)  # remove email
    new_tweet = re.sub(r'#', '', new_tweet)  # remove hash sign from hashtags
    new_tweet = re.sub("[^a-zA-Z]", " ", new_tweet)  # remove remaining special characters
    words = new_tweet.lower().split()  # lowercase, split into words
    words = [word for word in words if not word in stopwords_english]  # remove stop words
    words = [stemmer.stem(word) for word in words]  # stemming
    words = [lemma.lemmatize(word) for word in words]  # lemmatization
    # join words list back to one tweet
    return " ".join(words)
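# Hypothetical setup sketch for the module-level names preprocess_tweet relies on
# (stopwords_english, stemmer, lemma); the originals are not shown in this snippet.
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

stopwords_english = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemma = WordNetLemmatizer()

# Made-up example call: the URL (and the rest of that line) is stripped first,
# then stop words are removed and the remaining words are stemmed and lemmatized.
print(preprocess_tweet("Loving the new #NLP models! https://example.com @someone"))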
def get_mentions_pattern():
    return re.compile(r'@\w*')


def get_twitter_reserved_words_pattern():
    return re.compile(r'(RT|rt|FAV|fav|VIA|via)')


def get_blank_spaces_pattern():
    return re.compile(r'\s{2,}|\t')


def get_single_letter_words_pattern():
    return re.compile(r'(?<![\w\-])\w(?![\w\-])')


def get_hashtags_pattern():
    return re.compile(r'#\w*')


def get_hashtags_pattern():
    return re.compile(r'#([^\s]+)')
# @end AccessText

# @begin PreprocessFile @desc To preprocess the text data
# @in stopwords
# @in regexr @as regular_expression
# @in TextRead
# @out dictionary
# @out train_corpus
# @out test_corpus

en_stopwords = set(stopwords.words('english'))
de_stopwords = set(stopwords.words('german'))
fr_stopwords = set(stopwords.words('french'))
stopwords = en_stopwords | de_stopwords | fr_stopwords

regexr = re.compile(r'([a-z])\w+')

file_tokens = (word.lower() for word in text_file)
clean_file = [word for word in file_tokens if word not in stopwords]

# calculate word frequencies
word_frequency = defaultdict(int)
for text in text_file:
    for token in text:
        word_frequency[token] += 1

# only keep words that occur more than once
processed_corpus = [[token for token in text if word_frequency[token] > 1]
                    for text in text_file]

# associate each word in the corpus with a unique integer ID
dictionary = corpora.Dictionary(processed_corpus)
corpus = [dictionary.doc2bow(text) for text in processed_corpus]
def remove_numbers(self):
    """remove any numbers"""
    pattern = re.compile(r'[0-9]+')
    self.text = re.sub(pattern=pattern, repl='', string=self.text)
    return self
import string

import nltk
from nltk.corpus import stopwords
from nltk import re

MIN_YEAR = 1900
MAX_YEAR = 2100

emoji_pattern = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "]+",
    flags=re.UNICODE)

my_punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@.""-,`'


def get_url_patern():
    return re.compile(
        r'(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))'
        r'[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9]\.[^\s]{2,})')


def get_emojis_pattern():
    try:
        # UCS-4
        emojis_pattern = re.compile(
            u'([\U00002600-\U000027BF])|([\U0001f300-\U0001f64F])|([\U0001f680-\U0001f6FF])')
    except re.error:
        # UCS-2
        emojis_pattern = re.compile(
            u'([\u2600-\u27BF])|([\uD83C][\uDF00-\uDFFF])|([\uD83D][\uDC00-\uDE4F])|([\uD83D][\uDE80-\uDEFF])')
    return emojis_pattern
def tokenize(s):
    return tokens_re.findall(s)


def preprocess(s):
    tokens = tokenize(s)
    return tokens


regex_str = [
    r'<[^>]+>',  # HTML tags
    r'(?:@[\w_]+)',  # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)",  # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',  # URLs
    r'(?:(?:\d+,?)+(?:\.?\d+)?)',  # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])",  # words with - and '
    r'(?:[\w_]+)',  # other words
    r'(?:\S)'  # anything else
]

tokens_re = re.compile(r'(' + '|'.join(regex_str) + ')', re.VERBOSE | re.IGNORECASE)
print("Token compilation completed")

punctuation = list(string.punctuation)
stop = stopwords.words('english') + punctuation + [
    'rt', 'via', '…', 'trump', 'donaldtrump', 'therealdonaldtrump',
    'president', "trump's", 'donald'
]
print("Stopword list construction completed")
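# Hypothetical smoke test for tokenize() above; the tweet text is made up.
sample = "RT @user: Check https://example.com #NLP 42"
print(tokenize(sample))
# -> the mention, URL, hashtag, plain words and the number come out as separate tokens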
def remove_punctuation(self):
    """remove any punctuation"""
    pattern = re.compile(r'[^\w\s]')
    self.text = re.sub(pattern=pattern, repl='', string=self.text)
    return self
def clean_text(text):
    import nltk
    nltk.download('stopwords')
    nltk.download('wordnet')

    # split into words by white space
    words = text.split()

    # remove punctuation from each word
    import string
    table = str.maketrans('', '', string.punctuation)
    text = [w.translate(table) for w in words]
    text = " ".join(text)
    # print(stripped[:100])
    ## Remove punctuation
    # text = text.translate(string.punctuation)
    ########################################################################################
    # replace urls
    re_url = re.compile(
        r"((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*",
        re.MULTILINE | re.UNICODE)
    # replace ips
    re_ip = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")

    # replace URLs
    text = re_url.sub("URL", text)
    # replace IPs
    text = re_ip.sub("IPADDRESS", text)

    ####################################################################
    ## Convert words to lower case and split them
    text = text.lower().split()

    ## Remove stop words
    # stops = set(stopwords.words("english"))
    # text = [w for w in text if not w in stops and len(w) >= 3]
    text = " ".join(text)

    ## Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)

    ## Stemming
    text = text.split()
    # stemmer = SnowballStemmer('english')
    # stemmed_words = [stemmer.stem(word) for word in text]
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in text]
    text = " ".join(lemmatized_words)

    return text
def __init__(self, pattern):
    Pattern.__init__(self, pattern)
    # Track whether anaphor is first
    self.is_anaphor_first = pattern.startswith('<X>')
    self.regex_pattern = re.compile(
        self.pattern.replace('<X>', r'(\S+)').replace('<Y>', r'(\S+)'))
def get_url_patern():
    return re.compile(
        r'(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))'
        r'[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9]\.[^\s]{2,})')
plt.hist(non_spam_emph, label='Not spam')
plt.legend(loc='upper right')
plt.title('Emphasized words count - Spam VS not-spam')
fig3.show()

print('\nURLs exist in %s/%s of the spam docs' % (sum(i > 0 for i in spam_urls), spam_docs_count))
print('URLs exist in %s/%s of the non-spam docs' % (sum(i > 0 for i in non_spam_urls), non_spam_docs_count))

# Pre-processing data
lengths = 0
stem_it = True
sw = clean_sw()
max_features = 800
test_size = .2

# Clean repeating chars - looooooooooooooooooove -> love
pattern = re.compile(r"(.)\1{2,}", re.DOTALL)

for idx, doc in enumerate(x_train_):
    doc = strip_url(doc)
    doc = is_long_number(doc)
    doc = pattern.sub(r"\1", doc)
    doc = convert_emphesize(doc)
    tokens = [english_stemmer(w) for w in text_to_word_sequence(doc, filters=filters, lower=True)]
    x_train_[idx] = [w for w in tokens if w not in sw]
    lengths += len(x_train_[idx])

max_len = round(lengths / idx)  # Maybe I should get the average length of a spam document VS a non-spam document
print('Average document length: %s\n' % max_len)

x_train, x_test, y_train, y_test = train_test_split(x_train_, y_train,
def remove_twitter_handle(self):
    """remove twitter handles"""
    pattern = re.compile(r'RT')  # note: this only strips the literal retweet marker "RT"
    self.text = re.sub(pattern=pattern, repl='', string=self.text)
    return self
def get_twitter_reserved_words_pattern():
    return re.compile(r'(RT|FAV|VIA)')
def get_arabic_pattern():
    # return re.compile('[\u0627-\u064a]')
    return re.compile('[\u0600-\u06ff\u0750-\u077f\u08a0-\u08ff]+')