def tweet_clean(self, tweet):
    # Remove tickers
    sent_no_tickers = re.sub(r'\$\w*', '', tweet)
    tw_tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
    temp_tw_list = tw_tokenizer.tokenize(sent_no_tickers)
    # Remove stopwords
    list_no_stopwords = [
        i for i in temp_tw_list
        if i.lower() not in self._cache_english_stopwords
    ]
    # Remove hyperlinks
    list_no_hyperlinks = [
        re.sub(r'https?:\/\/.*\/\w*', '', i) for i in list_no_stopwords
    ]
    # Remove hashtags
    list_no_hashtags = [re.sub(r'#', '', i) for i in list_no_hyperlinks]
    # Remove punctuation and split 's, 't, 've with a space for the filter
    list_no_punctuation = [
        re.sub(r'[' + string.punctuation + ']+', ' ', i)
        for i in list_no_hashtags
    ]
    new_sent = ' '.join(list_no_punctuation)
    # Remove any words with 2 or fewer letters
    filtered_list = tw_tokenizer.tokenize(new_sent)
    list_filtered = [re.sub(r'^\w\w?$', '', i) for i in filtered_list]
    filtered_sent = ' '.join(list_filtered)
    # Remove multiple whitespace
    cleaned_tweet = re.sub(r'\s\s+', ' ', filtered_sent)
    # Remove any whitespace at the front of the sentence
    cleaned_tweet = cleaned_tweet.lstrip(' ')
    return cleaned_tweet
def parse_data_iterator(vocab, filename, delimiter=",", steps=10):
    vocab.add_word('</s>')
    file = open(filename, 'r')
    reader = csv.reader(file, delimiter=delimiter)
    headers = next(reader)
    list_of_train = []
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=False)
    for row in reader:
        curr = []
        encoded = []
        label = [row[1]]
        # csv yields strings, so compare against the string form of the label
        if row[1] == '0':
            label.append(1)
        else:
            label.append(0)
        words = tokenizer.tokenize(" ".join(row[3:]))
        for i in range(steps):
            if i < len(words):
                try:
                    words[i] = str(words[i])
                except:
                    words[i] = words[i]
                words[i] = canon_word(words[i])
                vocab.add_word(words[i])
                curr.append(words[i])
            else:
                curr.append('</s>')
        for word in curr:
            encoded.append(vocab.encode(word))
        yield label, curr
def data_processing(df):
    t = TweetTokenizer()
    emotions = [
        'anger', 'anticipation', 'disgust', 'fear', 'joy', 'negative',
        'positive', 'sadness', 'surprise', 'trust'
    ]
    df['translated_full_text'] = df['translated_full_text'].astype(str).apply(
        remove_links)
    df['cleaned_text'] = df['translated_full_text'].astype(str).apply(
        style_text)
    df['cleaned_text'] = df['cleaned_text'].astype(str).apply(
        lambda x: remove_words(x.split(), stopcorpus))
    df['cleaned_text'] = df['cleaned_text'].apply(collapse_list_to_string)
    df['cleaned_text'] = df['cleaned_text'].astype(str).apply(
        remove_apostrophes)
    df['tokenized_sents'] = df.apply(
        lambda row: t.tokenize(row['cleaned_text']), axis=1)
    df['word_count'] = df.apply(lambda row: len(row['tokenized_sents']),
                                axis=1)
    df = df[df.word_count > 0]
    df = text_emotion(df)
    for emotion in emotions:
        df[emotion] = df[emotion] / df['word_count']
    date = datetime.datetime.strptime(df['created_at'].min(),
                                      '%Y-%m-%d %H:%M:%S').date()
    df.to_pickle(str(date) + ".pickle")
    df.to_excel(str(date) + ".xlsx")
    return
def preprocess_tweet(tweet):
    """
    This function will preprocess the input tweet

    Steps for preprocessing:
    1. Lowercase the letters
    2. Replace characters repeated more than 3 times in a word with 3 repetitions
    3. Replace a URL with the tag: <URLURL>
    4. Replace a user mention with the tag: <UsernameMention>

    @TODO:
    1. Look for better preprocessing methods on the web
    2. Apply here
    """
    clean_tweet = tp.clean(tweet)

    # perform lemmatization
    tokenizer = TweetTokenizer()
    tweet_tokens = tokenizer.tokenize(clean_tweet)
    lemmatized_tweet = lemmatize_tweet(tweet_tokens)

    # remove stopwords
    preprocessed_tweet = remove_stopwords(lemmatized_tweet)
    return preprocessed_tweet
def tokenize_comments(base_dir, comments_file, hashh=None):
    tkd_data = None
    if hashh:
        tkd_data = load_cached_data(hashh)
    if tkd_data is None:
        hash_f = get_cache_path(hashh)
        with open(hash_f, 'wb') as pkl_f:
            tkd_data = defaultdict(dict)
            tk = TweetTokenizer(preserve_case=True, reduce_len=False,
                                strip_handles=False)
            for i, (root, dirs, files) in enumerate(os.walk(base_dir)):
                if comments_file in files:
                    project = root.split('/')[-1]
                    print('Processing %s, number %d' % (project, i))
                    posts = []
                    with open(os.path.join(root, comments_file), 'r') as inf:
                        r = csv.DictReader(inf)
                        for row in r:
                            p = post(' '.join(list(tk.tokenize(row['body']))),
                                     row['login'], row['mention_login'],
                                     row['issue_num'], row['datetime'],
                                     project)
                            posts.append(p)
                    tkd_data[project] = posts
            pickle.dump(tkd_data, pkl_f)
    return tkd_data
class WordTokenizerNLTK(BaseWordTokenizer):
    """ Uses an NLTK tokenizer to tokenize a sentence into words.

    When re-joining/de-tokenizing, this works best with JoinerNLTK.

    Why the "TweetTokenizer"? NLTK's recommended word tokenizer as of 3.2.4 is
    Treebank, and I started with that and stuck with it for a while. But I
    couldn't get it to stop tokenizing contractions ("don't" -> "do n't"),
    even when it seemed like I had configured it right. The tokenizing of
    contractions has its uses in NLP in general, but in our context here it's
    not very useful, and it presents big pains for testing. TweetTokenizer is
    a good compromise, and it's actually pretty general purpose, despite its
    tongue-in-cheek name.

        http://www.nltk.org/api/nltk.tokenize.html
        http://text-processing.com/demo/tokenize/
    """

    def __init__(self):
        super(WordTokenizerNLTK, self).__init__()
        self.strategy = TweetTokenizer(preserve_case=True, reduce_len=False,
                                       strip_handles=False)

    def tokenize(self, text):
        """
        >>> WordTokenizerNLTK().tokenize("It's a beautiful day today...")
        [u"It's", u'a', u'beautiful', u'day', u'today', u'...']
        >>> from presswork.text.clean import CleanInputString
        >>> WordTokenizerNLTK().tokenize(CleanInputString("Hello there!!!"))
        [u'Hello', u'there', u'!', u'!', u'!']
        """
        text = clean.CleanInputString(text).unwrap()
        return WordList(self.strategy.tokenize(text))
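# Illustrative aside (not part of the original module): the contraction behavior
# described in the docstring above is easy to verify. This is a minimal sketch
# comparing NLTK's TreebankWordTokenizer with TweetTokenizer on a contraction.
from nltk.tokenize import TreebankWordTokenizer, TweetTokenizer

print(TreebankWordTokenizer().tokenize("I don't care"))
# ['I', 'do', "n't", 'care']
print(TweetTokenizer(preserve_case=True).tokenize("I don't care"))
# ['I', "don't", 'care']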
def get_train_test_data(find_and_concatenate_expressions=False):
    def remove_url(tokens):
        tokens = filter(lambda x: "http" not in x, tokens)
        return list(tokens)

    def remove_hashtags(tokens):
        tokens = map(lambda x: x.replace('#', ''), tokens)
        return list(tokens)

    db = pd.read_excel("Classeur1.xlsx", encoding="utf-8")
    dict_values = {'Not Relevant': -1, 'Relevant': 1, "Can't Decide": 0}
    db["to_predict"] = db.choose_one.map(dict_values)
    db = db[["text", "to_predict"]]
    twtk = TweetTokenizer(preserve_case=False, reduce_len=True,
                          strip_handles=True)
    db["token_retreated_text"] = db["text"].apply(
        lambda x: remove_hashtags(remove_url(twtk.tokenize(x))))
    db["retreated_text"] = db["token_retreated_text"].apply(
        lambda x: " ".join(x))
    if find_and_concatenate_expressions:
        db["token_retreated_text"] = clean_corpus(db["retreated_text"])
        db["retreated_text"] = db["token_retreated_text"].apply(
            lambda x: " ".join(x))
    msk = np.random.rand(len(db)) < 0.8
    train = db[msk]
    test = db[~msk]
    return train, test
class TweetVectorizer:
    _emoticon_mapping = {
        ":-)": "<smileface>",
        ":)": "<smileface>",
        ":D": "<lolface>",
        ":-D": "<lolface>",
        ":|": "<neutralface>",
        ":-(": "<sadface>",
        ":(": "<sadface>",
    }

    def __init__(
        self,
        token_mapping: Mapping[str, int],
        preserve_case: bool = False,
    ):
        self._token_mapping = token_mapping
        self._tokenizer = TweetTokenizer(preserve_case=preserve_case)

    def __call__(self, text: str) -> List[int]:
        token_mapping = self._token_mapping
        return [token_mapping[token] for token in self.tokenize(text)]

    def tokenize(self, text: str) -> List[str]:
        return self._tokenizer.tokenize(self.preprocess(text))

    def preprocess(self, text: str) -> str:
        get_emoticon = self._emoticon_mapping.get
        text = URLS_RE.sub("<url>", text)
        text = USERNAMES_RE.sub("<user>", text)
        text = HASHTAGS_RE.sub("<hashtag>", text)
        text = NUMBERS_RE.sub("<number>", text)
        text = EMOTICON_RE.sub(lambda m: get_emoticon(m.group()) or m.group(),
                               text)
        return text
def nltk_tweet_tokenizer(s, **tokenizer_kwargs):
    """NLTK TweetTokenizer"""
    kwargs = dict(strip_handles=False, reduce_len=True)
    kwargs.update(**tokenizer_kwargs)
    tokenizer = TweetTokenizer(**kwargs)
    token_list = tokenizer.tokenize(s)
    return token_list
def tokenize(sents: list):
    tokenizer = TweetTokenizer(preserve_case=True, strip_handles=True)
    sents_tok = []
    for sent in sents:
        tokens = [token for token in tokenizer.tokenize(sent)
                  if not token.startswith('http')]
        sents_tok.append(' '.join(tokens))
    return sents_tok
def tokenize(self, text):
    # Make a list where each word is an element.
    text_list = text.split(' ')
    # Lemmatize each word. Exception: we want "better" to become its lemma
    # "good", but "best" should stay "best". There are nltk methods for this.
    # See https://www.youtube.com/watch?v=uoHVztKY6S4
    # Remove the articles 'a', 'an', 'the'.
    # Also split on punctuation marks so that "I like, fish" becomes
    # ['I', 'like', ',', 'fish'] = token_list
    tweettokenizer = TweetTokenizer()
    lemmatizer = WordNetLemmatizer()
    token_list = tweettokenizer.tokenize(text)
    # Drop every occurrence of the articles, not just the first one.
    token_list = [t for t in token_list if t not in ('a', 'an', 'the')]
    pos_list = pos_tag(token_list)
    pos_listwordnet = [(word[0], self.get_wordnet_pos(word[1]))
                       for word in pos_list]
    for i in range(len(token_list)):
        token_list[i] = lemmatizer.lemmatize(token_list[i],
                                             pos=pos_listwordnet[i][1])
    if len(token_list) == 1:
        token_list.append('.')
    return token_list
class WordTextTransformer:
    def __init__(self, source_vocabulary, target_vocabulary,
                 max_source_length, max_target_length):
        self.source_vocabulary = source_vocabulary
        self.target_vocabulary = target_vocabulary
        self.max_source_length = max_source_length
        self.max_target_length = max_target_length
        self.tokenizer = TweetTokenizer()

    def tokenize(self, text):
        return self.tokenizer.tokenize(text)

    def transform_xi(self, text):
        xi = self.tokenize(text)
        xi = self.source_vocabulary.encode(xi)
        xi = pad_sequences([xi], maxlen=self.max_source_length,
                           padding="post")[0]
        return np.array(xi)

    def transform_yi(self, text):
        yi = self.tokenize(text)
        yi = self.target_vocabulary.encode(yi)
        yi = pad_sequences([yi], maxlen=self.max_target_length,
                           padding="post")
        yi = to_categorical(yi, num_classes=len(self.target_vocabulary))
        return np.array(yi)
class TweetSentimentAnalyzer:
    def __init__(self):
        self.tokenizer = TweetTokenizer(reduce_len=True, strip_handles=True)
        self.analyzer = SentimentIntensityAnalyzer()

    def get_sentiment(self, text):
        cleaned_tweet = ' '.join(self.tokenizer.tokenize(text))
        return self.analyzer.polarity_scores(cleaned_tweet)['compound']
def tokenize_tweets(file):
    tweet_tokenizer = TweetTokenizer()
    text = reader.raw(file)
    link_free = remove_links(text)
    emoji_free = remove_emoji(link_free)
    user_free = remove_users(emoji_free)
    number_free = remove_numebrs(user_free)
    hashtag_free = remove_hashtags(number_free)
    twitter_words = [
        term.lower() for term in tweet_tokenizer.tokenize(hashtag_free)
        if term.lower() not in stop_words
    ]
    twitter_words_with_hashtags = [
        term.lower() for term in tweet_tokenizer.tokenize(number_free)
        if term.lower() not in stop_words
    ]
    return twitter_words, twitter_words_with_hashtags
def tokenize(s):
    sentence_splitter = TweetTokenizer()
    tokens = sentence_splitter.tokenize(s)
    result = []
    for word in tokens:
        result.append(
            unicodedata.normalize('NFKD', word).encode('ascii', 'ignore'))
    return result
def tokeniza(chars, keyword=None):
    """ Tokenize a string (duplicates keywords, if any) """
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tokens = tokenizer.tokenize(chars)
    return tokens
def preprocess(comments, preprocessors):
    tokenizer = TweetTokenizer()
    html_cleaner = re.compile('<.+?>')
    for comment in comments:
        comment = html_cleaner.sub('', comment)
        tokenized_comment = tokenizer.tokenize(comment)
        for preprocessor in preprocessors:
            tokenized_comment = preprocessor.optimize(tokenized_comment)
        yield tokenized_comment
def tokenize(text):
    tweet_tokenizer = TweetTokenizer()
    # 1. Tokenize
    text = tweet_tokenizer.tokenize(text)
    # 2. Cleaning
    # Remove punctuation
    text = [t for t in text if t not in string.punctuation]
    # Normalize to lowercase
    text = [t.lower() for t in text]
    return text
def preprocess(text):
    tokenizer = TweetTokenizer()
    # Remove stopwords and non-alphabetic tokens.
    tokens = tokenizer.tokenize(text)
    tokens = [
        token for token in tokens
        if token not in ENGLISH_STOPWORDS and token.isalpha()
    ]
    return tokens
def modify_abbrev(tweet):
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tokens = tokenizer.tokenize(tweet)
    # Replace each token with its expansion if it is a known abbreviation.
    tokens = [
        abbreviations[w.lower()] if w.lower() in abbreviations else w
        for w in tokens
    ]
    text = ' '.join(tokens)
    return text
def build_extractors_datasets(train_set):
    """
    Transform the training set from its original form into nltk.Tree objects
    organized by entity type: {entity_type: [tree1, tree2, ...]}
    """
    tokenizer = TweetTokenizer()
    datasets = defaultdict(list)
    for _, samples in train_set.items():
        for sample in samples:
            words = tokenizer.tokenize(sample[0])
            tagged = nltk.pos_tag(words)
            for entity_type, entity in sample[1].items():
                entity_words = tokenizer.tokenize(entity)
                tree = mark_entities(tagged, entity_words, entity_type)
                datasets[entity_type].append(tree)
    return datasets
def clear_data(tweet):
    tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
    tokens = tokenizer.tokenize(tweet)
    clean_tweet = tokens \
        | remove_urls \
        | process_hashtags \
        | remove_stopwords \
        | remove_numbers \
        | remove_multiple_occurrence
    return ' '.join(clean_tweet)
class votesmartParser:
    # initialization
    def __init__(self, raw_path, save_path):
        self.raw_path = raw_path
        self.save_path = save_path
        self.wnl = WordNetLemmatizer()
        self.tknzr = TweetTokenizer(preserve_case=False, reduce_len=True)

    # write one record
    def write(self, tokens):
        self.bif = open(os.path.join(self.save_path, "bigrams.tsv"), "a")
        for i in range(len(tokens) - 1):
            token = tokens[i] + "_" + tokens[i + 1]
            if self.party == "rep":
                self.bif.write(token + "\t1\t0\n")
            if self.party == "dem":
                self.bif.write(token + "\t0\t1\n")
        self.unif = open(os.path.join(self.save_path, "unigrams.tsv"), "a")
        for i in range(len(tokens)):
            token = tokens[i]
            if self.party == "rep":
                self.unif.write(token + "\t1\t0\n")
            if self.party == "dem":
                self.unif.write(token + "\t0\t1\n")

    # parse records one by one
    def parse(self, tmp_f):
        soup = BeautifulSoup(tmp_f, "html.parser")
        for div in soup.find_all("div"):
            if div.get("itemprop"):
                if "articleBody" in div.get("itemprop"):
                    text = div.get_text().replace(r"\n", "\n").replace(
                        r"\'", "'").replace("-", "")
                    text = re.sub("Source:", "", text)
                    text = re.sub(r"http\S+", "", text)
                    text = re.sub(r"[^\w\s]", "", text)
                    tokens = self.tknzr.tokenize(text)  # lower and tokenize
                    tokens = [
                        self.wnl.lemmatize(token, "n") for token in tokens
                    ]  # lemmatization: nouns
                    tokens = [
                        self.wnl.lemmatize(token, "v") for token in tokens
                    ]  # lemmatization: verbs
                    tokens = [token for token in tokens if token not in stops]
                    self.write(tokens)

    # traverse all raw responses
    def traverse(self):
        for path_base, _, path_postfixes in os.walk(self.raw_path):
            for path_postfix in path_postfixes:
                print(path_postfix)
                self.party = path_base.split("/")[-1].split("_")[1]
                with open(os.path.join(path_base, path_postfix), "r") as tmp_f:
                    self.parse(tmp_f)
def process_tweet_text(tweet):
    if tweet.startswith('@null'):
        return "[Tweet not available]"
    tweet = re.sub(r'\$\w*', '', tweet)  # Remove tickers
    tweet = re.sub(r'https?:\/\/.*\/\w*', '', tweet)  # Remove hyperlinks
    tweet = re.sub(r'[' + string.punctuation + ']+', ' ', tweet)  # Remove punctuation such as 's
    twtok = TweetTokenizer(strip_handles=True, reduce_len=True)
    tokens = twtok.tokenize(tweet)
    tokens = [i.lower() for i in tokens
              if i not in stopwords and len(i) > 2 and i in english_vocab]
    return tokens
def normalize_messages(messages):
    tokenizer = TweetTokenizer(preserve_case=False)
    normalized_messages = []
    for message in messages:
        try:
            tokens = tokenizer.tokenize(message)
            text = [word.lower() for word in Text(tokens)]
            if text:
                normalized_messages.append(text)
        except TypeError:
            pass
    return normalized_messages
class TweetProcessor:
    def __init__(self, data_loader):
        self.data = data_loader
        self.tokenizer = TweetTokenizer()
        self.stemmer = PorterStemmer()
        self.stopwords = stopwords.words('english')
        self.re_url = r'http\S+'
        self.punctuation = string.punctuation
        self.vocab = defaultdict(set)

    def __iter__(self):
        yield from self.process_tweet()

    def process_tweet(self):
        for tokens in self.token_generator():
            processed_tweet = []
            for token in tokens:
                processed_token, tag = self.process_token(token)
                if processed_token:
                    processed_tweet.append((processed_token, tag))
            if processed_tweet:
                yield processed_tweet

    def token_generator(self):
        for text in self.data.corpus['text']:
            yield self.tokenizer.tokenize(text)

    def process_token(self, token):
        original = token
        if re.match(self.re_url, token):
            url = self.data.expanded_urls.get(token, token)
            return TweetProcessor.clean_url(url), 'URL'
        token = token.lower()
        if token in self.stopwords or token in self.punctuation:
            return None, None
        if token.startswith('@'):
            return None, None
        token = token.translate({ord(k): "" for k in self.punctuation})
        # token = self.stemmer.stem(token)
        self.vocab[token].add(original)
        return token, self.data.token_tags.get(original, "NA")

    @staticmethod
    def clean_url(url):
        spl = urlsplit(url)
        spl = urlsplit(spl.geturl())
        return urlunsplit((spl[0], spl[1], spl[2], '', ''))
def create_matrix(tweets: List, name: str = 'oscar pistorius') -> csr_matrix:
    matrix_loc = Path('data', name, 'tf_idf_matrix.pickle')
    if matrix_loc.exists():
        logger.info("Matrix exists! loading...")
        with matrix_loc.open('rb') as f:
            matrix = pickle.loads(f.read())
        return matrix
    stemmer = PorterStemmer()
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True)
    texts = []
    for tweet in tqdm(tweets, desc="(create_matrix) iterating over tweets..."):
        text = tweet.text
        tokens = tokenizer.tokenize(text)
        text_proc = []
        for token in tokens:
            token = token.strip()
            if len(token) < 3:
                continue
            elif token in stopwords.words('english'):
                continue
            elif nlp_utils.match_url(token):
                continue
            elif token in string.punctuation:
                continue
            # elif token.startswith(("#", "$")):
            #     continue
            token = token.translate({ord(k): "" for k in string.punctuation})
            token = stemmer.stem(token)
            token = token.strip()
            if token == "":
                continue
            text_proc.append(token)
        texts.append(text_proc)
    vectorizer = TfidfVectorizer(analyzer="word", tokenizer=lambda x: x,
                                 lowercase=False)
    m = vectorizer.fit_transform(texts)
    logger.info("Saving computed matrix...")
    with matrix_loc.open('wb') as f:
        f.write(pickle.dumps(m))
    return m
def tokenize_text(text):
    """
    Transforms the specified text into tokens using the Twitter tokenizer.
    @params: str        Input text to tokenize
    @returns: list(str) Returns the tokens as a list of strings.
    """
    tokenizer = TweetTokenizer()
    # tokenizing the text
    tokens = tokenizer.tokenize(text)
    words = [w.lower() for w in tokens]
    return words
def compare(topicsFileName, headlinesFileName):
    """
    This function compares a set of detected trending topics to a list of
    headlines in the JSON format provided by NewsAPI. A detected trending
    topic is considered as matching a headline if the intersection of the
    headline and the topic is at least 40%.
    It returns the list of trending topics that are included in the provided
    headlines, as well as:
        recall: number of matching topics divided by number of headlines
        precision: number of matching topics divided by number of detected topics
    """
    # load topics from file
    topics = []
    with open(topicsFileName, 'r', encoding='utf-8') as tf:
        topics = [json.loads(line) for line in tf]

    # load headlines from file
    headlines = []
    with open(headlinesFileName, 'r', encoding='utf-8') as hf:
        headlines = [json.loads(line) for line in hf]

    # prepare stemmer and tokenizer
    stemmer = PorterStemmer(mode=PorterStemmer.MARTIN_EXTENSIONS)
    tokenizer = TweetTokenizer()

    # compare every topic with every headline
    matchingTopics = []
    for tIter, topic in enumerate(topics):
        print('\r', tIter + 1, len(topics), end='', file=sys.stderr)
        for headline in headlines:
            # split headline title (rather than description) into stemmed terms
            if 'title' not in headline or headline['title'] is None or len(
                    headline['title']) == 0:
                continue
            usedText = headline['title']
            headlineTerms = [
                stemmer.stem(term) for term in tokenizer.tokenize(usedText)
                if term not in stopwords.stopwords + stopwords.moreStopwords
            ]
            # check for inclusion of topic in headline
            if len(set(topic['terms'].keys()) & set(headlineTerms)) >= \
                    0.4 * min(len(set(headlineTerms)),
                              len(set(topic['terms'].keys()))):
                matchingTopics.append(topic)
                break
    print(file=sys.stderr)
    precision = len(matchingTopics) / len(topics)
    recall = len(matchingTopics) / len(headlines)
    return matchingTopics, recall, precision
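# Illustrative aside (not from the original script): a worked example of the 40%
# inclusion criterion used in compare(). A topic matches a headline when the term
# overlap reaches 40% of the smaller of the two term sets. The terms below are
# made up for demonstration.
topic_terms = {'wildfire', 'evacuation', 'california'}
headline_terms = {'california', 'wildfire', 'forces', 'evacuation', 'orders'}

overlap = topic_terms & headline_terms                        # 3 shared terms
threshold = 0.4 * min(len(topic_terms), len(headline_terms))  # 0.4 * 3 = 1.2
print(len(overlap) >= threshold)                              # True -> counted as a match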
def preprocess(text, sentiments, w2i, maxlen, shuffle=True):
    tokenizer = TweetTokenizer()
    reviews = []
    for t in text:
        tokens = list(tokenizer.tokenize(t))
        token_idx = convert_str_to_idx(tokens, w2i, maxlen)
        reviews.append(token_idx)
    txt, sents = torch.LongTensor(reviews), torch.FloatTensor(sentiments)
    if shuffle:
        txt, sents = _shuffle(txt, sents)
    return txt, sents.unsqueeze(1)