def tokenize(self, tweet):
    tweet = remove_handles(tweet)
    tweet = tweet.replace('#', ' ')
    tweet = tweet.replace('<', ' ')
    tweet = tweet.replace('>', ' ')
    tweet = tweet.replace('&', ' und ')
    tweet = tweet.replace('|LBR|', ' ')
    tweet = tweet.replace('-', ' ')
    tweet = tweet.replace('_', ' ')
    tweet = tweet.replace("'s", ' ')
    tweet = tweet.replace(",", ' ')
    tweet = tweet.replace(";", ' ')
    tweet = tweet.replace(":", ' ')
    tweet = tweet.replace("/", ' ')
    tweet = tweet.replace("+", ' ')
    tknzr = Tokenizer_NLTK(preserve_case=self.preserve_case, reduce_len=True)
    if self.join:
        return " ".join(tknzr.tokenize(tweet))
    elif self.use_stemmer:
        stmmr = Stemmer_NLTK()
        return [stmmr.stem(token) for token in tknzr.tokenize(tweet)]
    else:
        return tknzr.tokenize(tweet)
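# Usage sketch for the tokenizer above (hedged: Tokenizer_NLTK is assumed to be an alias
# for nltk.tokenize.TweetTokenizer, and the sample tweet is made up). The replace chain
# can be exercised standalone like this:
from nltk.tokenize import TweetTokenizer
from nltk.tokenize.casual import remove_handles

def _clean_tweet(tweet):
    # Same substitutions as in tokenize() above, driven by a table.
    for old, new in [('#', ' '), ('<', ' '), ('>', ' '), ('&', ' und '), ('|LBR|', ' '),
                     ('-', ' '), ('_', ' '), ("'s", ' '), (',', ' '), (';', ' '),
                     (':', ' '), ('/', ' '), ('+', ' ')]:
        tweet = tweet.replace(old, new)
    return tweet

print(TweetTokenizer(preserve_case=False, reduce_len=True).tokenize(
    _clean_tweet(remove_handles("@user Tolles #Bitcoin-Update & mehr!!!"))))
# e.g. ['tolles', 'bitcoin', 'update', 'und', 'mehr', '!', '!', '!']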
def read_files(filemap, categories, stopwordsPunctuationList):
    """Reads every line in the files from get_filenames_in_folder(),
    tokenizes the lines and creates a bag of words from them with bag_of_words()."""
    feats = list()
    print("\n##### Reading files...")
    for category in categories:
        files = get_filenames_in_folder(filemap + '/' + category)
        random.shuffle(files)
        num_files = 0
        for f in files:
            if not f.startswith('.'):
                data = open(filemap + '/' + category + '/' + f, 'r', encoding='UTF-8').read()
                for line in data.split('\n'):
                    line = remove_handles(line)
                    line = re.sub(r"http\S+", "", line)
                    line = tokenizer.tokenize(line)
                    line = ' '.join(line)
                    line = line.lower()
                    line = tknzr.tokenize(line)
                    line = ' '.join(line)
                    line = line.split()
                    # Skip empty lines (e.g. bare newlines)
                    if not line == []:
                        filtered_tokens = [
                            w for w in line
                            if not w.isdigit()
                            and w not in stopwordsPunctuationList
                            and w not in ['bitcoin', 'Bitcoin', 'btc', 'BTC']
                        ]
                        #print(filtered_tokens)
                        bag = bag_of_words(filtered_tokens)
                        feats.append((bag, category))
                        #print(feats)
                        #print(len(filtered_tokens))
                num_files += 1
                if num_files >= 1000:
                    # Testing cap: only the first N documents per category are loaded
                    # instead of the whole collection, so the run is faster. Comment out
                    # this check and the break for a full run.
                    break
        print(" Category %s, %i files read" % (category, num_files))
    print(" Total, %i files read" % (len(feats)))
    return feats
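# Hedged sketch: bag_of_words() and get_filenames_in_folder() are project helpers that are
# not shown here. A common NLTK-style bag_of_words, which the (bag, category) tuples above
# assume, is simply a word-presence dictionary:
def bag_of_words(words):
    return dict([(word, True) for word in words])

print(bag_of_words(['price', 'rises']))   # {'price': True, 'rises': True}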
def tokenize(self, text):
    """
    :param text: str
    :rtype: list(str)
    :return: a tokenized list of strings; normalizes URLs, usernames and
        word lengthening depending on the attributes of the instance.
    """
    # Fix HTML character entities:
    text = _replace_html_entities(text)
    # Remove or replace username handles
    if self.strip_handles:
        text = remove_handles(text)
    elif self.normalize_usernames:
        text = normalize_mentions(text)
    if self.normalize_urls:
        # Shorten problematic sequences of characters
        text = normalize_urls(text)
    # Normalize word lengthening
    if self.reduce_len:
        text = HANG_RE.sub(r'\1\1\1', text)
        text = reduce_lengthening(text)
    # Tokenize:
    safe_text = HANG_RE.sub(r'\1\1\1', text)
    words = WORD_RE.findall(safe_text)
    # Possibly alter the case, but avoid changing emoticons like :D into :d:
    # lower-case words but keep words that are in all upper case
    if not self.preserve_case:
        words = [_lowerize(w, self.keep_allupper) for w in words]
    words = [_stock_code(w) for w in words]
    return words
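# Quick standalone check of the lengthening normalization used above: reduce_lengthening
# is the stock NLTK helper (HANG_RE, WORD_RE, _lowerize and _stock_code are fork-specific
# and not shown here).
from nltk.tokenize.casual import reduce_lengthening
print(reduce_lengthening("waaaaayyyy toooo long"))   # 'waaayyy tooo long'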
def tokenize(self, text):
    """
    :param text: str
    :rtype: list(str)
    :return: a tokenized list of strings; concatenating this list returns\
        the original string if `preserve_case=False`
    """
    # Fix HTML character entities:
    text = _replace_html_entities(text)
    # Remove username handles
    if self.strip_handles:
        text = remove_handles(text)
    # Normalize word lengthening
    if self.reduce_len:
        text = reduce_lengthening(text)
    # Shorten problematic sequences of characters
    safe_text = HANG_RE.sub(r"\1\1\1", text)
    # Tokenize with the stock REGEXPS plus Discord-style emoji/emote and mention patterns:
    custom_Re = regex.compile(
        r"""(%s)"""
        % "|".join(
            (
                r":[^:\s]+:",
                r"<:[^:\s]+:[0-9]+>",
                r"<a:[^:\s]+:[0-9]+>",
                r"<(?:[^\d>]+|:[A-Za-z0-9]+:)\w+>",
            )
            + REGEXPS
        ),
        regex.VERBOSE | regex.I | regex.UNICODE,
    )
    words = custom_Re.findall(safe_text)
    # Possibly alter the case, but avoid changing emoticons like :D into :d:
    if not self.preserve_case:
        words = list(
            map((lambda x: x if EMOTICON_RE.search(x) else x.lower()), words)
        )
    return words
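# Standalone check of the Discord-style emote alternatives added above (REGEXPS and
# EMOTICON_RE come from nltk.tokenize.casual; the emote patterns are the custom additions):
import regex
emote_re = regex.compile(r":[^:\s]+:|<:[^:\s]+:[0-9]+>|<a:[^:\s]+:[0-9]+>")
print(emote_re.findall("nice <:pog:123456> :thumbsup: <a:party:9876>"))
# -> ['<:pog:123456>', ':thumbsup:', '<a:party:9876>']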
def text_cleaner(text):
    use_GermanStemmer = False
    tokens = False
    # Remove username handles (do we actually need the user names?)
    text = remove_handles(text)
    # Remove punctuation marks
    text_blob = TextBlob(text)
    text = ' '.join(text_blob.words)
    # Replace the umlauts
    # =========================================================================
    # text = re.sub('ä', 'ae', text)
    # text = re.sub('ö', 'oe', text)
    # text = re.sub('ü', 'ue', text)
    # text = re.sub('Ä', 'Ae', text)
    # text = re.sub('Ö', 'Oe', text)
    # text = re.sub('Ü', 'Ue', text)
    # text = re.sub('ß', 'ss', text)
    # =========================================================================
    # Remove the numbers
    text = re.sub(r'[0-9]+', '', text)
    # Remove emojis (and anything else outside the German character whitelist)
    german_char = " abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZäöüÄÖÜ"
    text = ''.join(c for c in text if c in german_char)
    tokenizer = TweetTokenizer(preserve_case=True, reduce_len=True)
    if tokens:
        return tokenizer.tokenize(text)
    elif use_GermanStemmer:
        stemmer = GermanStemmer()
        return [stemmer.stem(token) for token in tokenizer.tokenize(text)]
    else:
        return text
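# The emoji/number stripping above is a character whitelist; a standalone equivalent
# (the sample string is made up):
german_char = " abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZäöüÄÖÜ"
print(''.join(c for c in "Schönes Wetter 😀 #heute" if c in german_char))
# -> 'Schönes Wetter  heute'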
def word_bagger(tlist):
    # Take out twitter handles
    newdoc = []
    for doc in tlist:
        newdoc.append(casual.remove_handles(doc))
    # Initialize algo
    counter = CountVectorizer(ngram_range=(1, 2), stop_words=en_stop, min_df=4)
    # Fit the model
    counts = counter.fit_transform(newdoc).toarray()
    # Summarize counts
    vocab = counter.get_feature_names()
    dist = np.sum(counts, axis=0)
    word_counts = []
    for tag, count in zip(vocab, dist):
        word_counts += [{'count': count, 'word': tag}]
    word_counts = pd.DataFrame.from_dict(word_counts)
    # Return a Pandas DataFrame
    return word_counts
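# Minimal standalone illustration of the unigram+bigram counting done in word_bagger()
# (en_stop is defined elsewhere; the stop-word list and min_df are dropped here so a toy
# corpus still produces a vocabulary; note that get_feature_names() was renamed
# get_feature_names_out() in newer scikit-learn releases):
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range=(1, 2))
counts = cv.fit_transform(["bitcoin price rises", "bitcoin price falls"]).toarray()
print(dict(zip(cv.get_feature_names_out(), np.sum(counts, axis=0))))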
def preprocess(tweet_text):
    return URL_RE.sub(' ', remove_handles(_replace_html_entities(tweet_text)))
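# URL_RE is defined elsewhere in the project; with a simple placeholder pattern
# (an assumption, not the project's actual regex) the helper runs on its own:
import re
from nltk.tokenize.casual import remove_handles, _replace_html_entities
URL_RE = re.compile(r"https?://\S+")
print(preprocess("@bob check https://example.com &amp; thanks"))
# the handle and the URL are blanked out, '&amp;' becomes '&'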
def main():
    stopwordsPunctuationList = stopwords.words('english')
    stopwordsPunctuationList.extend(string.punctuation)
    stopwordsPunctuationList.append('not')

    categories = ['positive', 'negative']
    trainmap = 'trainset'
    feats = read_files(trainmap, categories, stopwordsPunctuationList)
    #print(feats)
    high_info_words = high_information(feats, categories)
    high_info_feats = filter_high_information_words(feats, high_info_words)
    train_feats, test_feats = split_train_test(high_info_feats)

    all_words = []
    for f in feats:
        for word in f[0].keys():
            all_words.append(word)
    frequency = nltk.FreqDist(all_words)
    word_features = frequency.keys()
    #print(word_features)

    # https://www.youtube.com/watch?annotation_id=annotation_3385405775&feature=iv&index=12&list=PLQVvvaa0QuDf2JswnfiGkliBInZnIC4HL&src_vid=zi16nl82AMA&v=-vVskDsHcVc
    def find_features(document):
        words = set(document)
        features = {}
        for w in word_features:
            features[w] = (w in words)
        return features

    train_set = nltk.classify.apply_features(find_features, high_info_feats)
    nb_classifier = nltk.NaiveBayesClassifier.train(train_set)

    testmap = 'testset'
    #test_feats = read_files(testmap, categories, stopwordsPunctuationList)
    test_set = nltk.classify.apply_features(find_features, test_feats)
    accuracy_score = evaluation(nb_classifier, test_set, categories)
    analysis(nb_classifier)

    research_data = [
        'research_data/07-06-2018.csv', 'research_data/08-06-2018.csv',
        'research_data/09-06-2018.csv', 'research_data/10-06-2018.csv',
        'research_data/11-06-2018.csv', 'research_data/12-06-2018.csv',
        'research_data/13-06-2018.csv', 'research_data/14-06-2018.csv',
        'research_data/15-06-2018.csv', 'research_data/16-06-2018.csv',
        'research_data/17-06-2018.csv', 'research_data/18-06-2018.csv',
        'research_data/19-06-2018.csv', 'research_data/20-06-2018.csv',
        'research_data/21-06-2018.csv', 'research_data/22-06-2018.csv',
        'research_data/23-06-2018.csv'
    ]

    for file in research_data:
        positives, negatives = 0, 0
        with open(file, 'r', encoding='UTF-8') as f:
            f_list = list(f)
            for line in f_list:
                line = line.strip()
                #str_list = list(filter(None, line))
                line = line.strip('\n')
                line = remove_handles(line)
                line = re.sub(r"http\S+", "", line)
                line = re.sub(r'\d+', '', line)
                line = tokenizer.tokenize(line)
                line = ' '.join(line)
                line = line.lower()
                line = tknzr.tokenize(line)
                line = [
                    w for w in line
                    if w not in stopwordsPunctuationList
                    and w not in ['rt', 'btc', 'bitcoin']
                ]
                line = ' '.join(line)
                result = nb_classifier.prob_classify(find_features(line.split()))
                # Only count classifications made with at least 70% confidence
                if result.max() == 'positive' and result.prob('positive') >= .7:
                    positives += 1
                elif result.max() == 'negative' and result.prob('negative') >= .7:
                    negatives += 1
                #break
        print('Data: {} Positives: {} Negatives: {} Confidence Positive: {} Confidence Negative: {}'
              .format(file, positives, negatives, result.prob('positive'), result.prob('negative')))
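# Minimal standalone illustration of the NaiveBayesClassifier / prob_classify pattern
# used in main() (the two training featuresets are made up):
import nltk
train = [({'moon': True, 'rise': True}, 'positive'),
         ({'crash': True, 'drop': True}, 'negative')]
clf = nltk.NaiveBayesClassifier.train(train)
dist = clf.prob_classify({'rise': True})
print(dist.max(), round(dist.prob(dist.max()), 2))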
def preprocess(lines, dataset, stem=True, tokenize=True, removeHandles=True,
               removeLinks=True, removeEmojis=True, removeSymbols=True,
               replaceSlang=True, verbose=True):
    # Input: raw lines from the dataset file
    # Output: list of preprocessed tweets (each a string, or a list of tokens if tokenize=True)
    tweets = []
    for i, line in enumerate(lines):
        if dataset == 'DisasterTweet':
            # The tweet text is everything except the last four comma-separated fields.
            tweetElements = [e.strip() for e in line.strip().split(",")[:-4]]
            tweetElements = [e for e in tweetElements if e != '']
            tweet = ",".join(tweetElements)
        elif dataset == 'cf10k':
            tweet = line
        elif dataset == 'cr26':
            tweet = line
        else:
            raise Exception('No dataset selected')

        if verbose:
            print('original:', tweet)
        if removeHandles:
            tweet = remove_handles(tweet)
            if verbose:
                print('removeHandles:', tweet)
        if removeLinks:
            tweet = remove_links(tweet)
            if verbose:
                print('removeLinks:', tweet)
        if stem:
            tweet = [stemmer.stem(word) for word in tweet.split() if word not in symbols]
            tweet = " ".join(tweet)
            if verbose:
                print('stem:', tweet)
        if tokenize:
            tweet = tk.tokenize(tweet)
            if verbose:
                print('tokenize', tweet)
        if removeEmojis:
            tweet = remove_emojis(tweet)
            if verbose:
                print('removeEmojis:', tweet)
        if removeSymbols:
            # Filter instead of calling remove() while iterating over the same list.
            tweet = [w for w in tweet if w not in symbols]
            if verbose:
                print('removeSymbols:', tweet)
        if replaceSlang:
            if type(tweet) == str:
                tweet = tk.tokenize(tweet)
            tweet = replace_slang(tweet)
            if verbose:
                print('replaceSlang:', tweet)

        if verbose:
            print('\n')

        tweets.append(tweet)
    return tweets
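# Standalone check of the 'DisasterTweet' branch above: the tweet text is everything
# except the last four comma-separated fields (the sample row is made up):
row = "Just happened a terrible car crash, , ,1,0,0,1\n"
parts = [p.strip() for p in row.strip().split(",")[:-4]]
print(",".join(p for p in parts if p))   # 'Just happened a terrible car crash'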