def custom_extract_features(data):
    """
    Implement your own feature extraction function here. The function should
    modify data by adding a binary np.array ex["FEATURES"] to each ex in data.
    """
    # Replace this with your own code.
    tokenizer = nltk.TweetTokenizer(preserve_case=False, reduce_len=True)
    stemmer = PorterStemmer()
    sw = {w: True for w in set(stopwords.words('english'))}
    get_worst = lambda x: max([e.neg_score() for e in x])
    get_best = lambda x: max([e.pos_score() for e in x])
    for data_set in data.values():
        for ex in data_set:
            # Words that are clearly loaded one way
            # ex['BODY'] = [word for word in tokenizer.tokenize(ex['BODY'])
            #               if len(list(swn.senti_synsets(word))) == 0
            #               or np.abs(get_best(list(swn.senti_synsets(word)))
            #                         - get_worst(list(swn.senti_synsets(word)))) > 0.25]

            # Preserve words not in SentiWordNet (likely smileys, abbreviations, etc.)
            # and otherwise keep only the words that have at least some sentiment score.
            ex['BODY'] = [
                word for word in tokenizer.tokenize(ex['BODY'])
                if len(list(swn.senti_synsets(word))) == 0
                or (get_best(list(swn.senti_synsets(word))) > 0.1
                    or get_worst(list(swn.senti_synsets(word))) > 0.1)
            ]
            # Filter stopwords
            # ex['BODY'] = [word for word in tokenizer.tokenize(ex['BODY']) if word not in sw]
            # ex['BODY'] = [word for word in tokenizer.tokenize(ex['BODY'])]
    extract_features(data)
def review_to_wordlist(review_text):
    review_text = emoji_to_text(review_text)
    review_text = abbreviation_to_text(review_text)
    review_text = re.sub(r"(@[\w]*\ )+", "@USER", review_text)
    duplicateSpacePattern = re.compile(r'\ +')
    review_text = re.sub(duplicateSpacePattern, ' ', review_text).strip()
    # review_text = re.sub(r"@[\w]*\ ", " ", review_text)
    # review_text = re.sub(r"(@[\w]*\ )+", "@USER ", review_text).strip()  # collapse repeated @USER mentions into a single @USER
    # print(review_text)
    # review_text = re.sub("[!?,.]", " ", review_text).strip()
    review_text = ekphrasis_config(review_text)
    review_text = re.sub(r"[^a-zA-Z0-9\@\&\:]", " ", str(review_text))
    # review_text = review_text.lower()
    # print(review_text)
    # words = stanford_tokenizer(review_text)
    # words = nltk.word_tokenize(review_text)
    # words = nltk.TweetTokenizer().tokenize(review_text)
    words = nltk.TweetTokenizer(strip_handles=True, reduce_len=True).tokenize(review_text)
    # return review_text
    return words
def load_4_aligning(sent_path, max_len=64, max_sent=200000):
    '''
    This function emulates load_align_corpus from utils.FTalignment.py.
    Use it when you want to encode English sentences with different models,
    instead of encoding sentences in different languages with the same model.
    '''
    import nltk

    sentences_1 = []
    bad_idx = []
    with open(sent_path) as sent_file:
        for i, line in enumerate(sent_file):
            if i >= max_sent:
                break
            sent_1 = nltk.TweetTokenizer().tokenize(line)
            if len(sent_1) > max_len:
                bad_idx.append(i)
            else:
                sentences_1.append(sent_1)
    alignments = [
        np.array([[j, j] for j, _ in enumerate(sent)])
        for sent in sentences_1
    ]
    return sentences_1, sentences_1, alignments
def read_data(data):
    text = open(data).read()
    # Clean up the data
    tokenizer = nltk.TweetTokenizer()
    # Separate into sentences
    tokens = [tokenizer.tokenize(t) for t in nltk.sent_tokenize(text)]
    # Remove the text preceding the class name of each sentence
    for i in range(len(tokens)):
        if 'pos' in tokens[i]:
            while tokens[i][0] != 'pos':
                tokens[i].remove(tokens[i][0])
        elif 'neg' in tokens[i]:
            while tokens[i][0] != 'neg':
                tokens[i].remove(tokens[i][0])
    # Get words and vocabulary
    vocab = nltk.word_tokenize(text)
    vocab = list(set(w.lower() for w in vocab))
    # Remove the class names from the vocabulary (two separate checks, so that
    # both 'pos' and 'neg' are dropped when both occur in the corpus)
    if 'pos' in vocab:
        vocab.remove('pos')
    if 'neg' in vocab:
        vocab.remove('neg')
    return tokens, vocab
def strip_handles(positive_sentences, negative_sentences, all_words, positive_words, negative_words):
    # Remove the handles, reduce elongated words, and lowercase everything
    tknzr = nltk.TweetTokenizer(strip_handles=True, reduce_len=True, preserve_case=False)
    # Get the number of positive sentences
    pos_num_sentences = len(positive_sentences)
    num_neg = 0
    # Run the tokenizer on each sentence and add its words to the dictionaries
    for sentences in positive_sentences:
        tknzr_list = tknzr.tokenize(sentences)
        split_into_words(tknzr_list, all_words, positive_words)
    for sentences in negative_sentences:
        tknzr_list = tknzr.tokenize(sentences)
        split_into_words(tknzr_list, all_words, negative_words)
        # Stop once as many negative sentences have been processed as there are positive ones
        if num_neg == pos_num_sentences:
            break
        else:
            num_neg += 1
    print("stripped handles and reduced length")
    print("Removed Stopwords")
    print("Used lemmatizer")
    print("Converted emoticons to text")
def tokenizer1(tweet):
    """ splits tweet into initial list of tokens """
    tknzr = nltk.TweetTokenizer(preserve_case=True, strip_handles=False)
    token_list = tknzr.tokenize(tweet)
    return token_list
def tokenize(tweets):
    tokenizer = nltk.TweetTokenizer()
    tweets_tokenized = []
    for text in tweets:
        cleaned = clean_text(text)
        tweets_tokenized.append(tokenizer.tokenize(cleaned))
    return tweets_tokenized
def tokenize(tweet):
    """
    Tokenize the tweet: split it into lowercase words, i.e. tokens,
    and return the list of tokens.
    """
    tokenizer = nltk.TweetTokenizer()
    return [word.lower() for word in tokenizer.tokenize(tweet)]
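# A minimal usage sketch for the tokenize helper above; the example tweet is
# invented. TweetTokenizer keeps hashtags and emoticons as single tokens, and
# the helper lowercases each one. Expected output, roughly:
# >>> tokenize("Loving the new #NLTK release :)")
# ['loving', 'the', 'new', '#nltk', 'release', ':)']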
def tokSentence(filename):
    file = open(filename)
    text = file.readlines()
    file.close()
    tok = nk.TweetTokenizer(reduce_len=True, strip_handles=False)
    print('tokenize tweets---->', end=' ')
    tokSentences = [tok.tokenize(i) for i in text]
    print('finish')
    return tokSentences
def __init__(self):
    self.tweet_tokenizer = nltk.TweetTokenizer(preserve_case=False)
    self.stop_words = set(stopwords.words('english'))
    # self.stemmer = nltk.SnowballStemmer('english')
    self.stemmer = nltk.PorterStemmer()
    self.custom_stopwords = [
        'that', 'for', 'in', 'this', 'is', 'of', 'to', 'it', 'and', 'the'
    ]
def load_tweets_to_df(valid_labels=[0, 1], tweets_dir=config.TRAINING_TWEETS_PATH,
                      labels_dir=config.TRAINING_LABELS_PATH, valid_users=None):
    tknzr_pos_tagging = nltk.TweetTokenizer(preserve_case=True, reduce_len=True)
    tweet_files = glob.glob(os.path.join(tweets_dir, '*.*'))
    with open(labels_dir, 'r') as f:
        reader = csv.reader(f)
        next(reader, None)
        users = {row[0]: {'age': row[1], 'num_tweets': row[2], 'gender': row[3], 'condition': row[4]}
                 for row in reader}
    X = []
    D = []
    Y = []
    Users = []
    NLTK_PTAGS = []
    CMU_PTAGS = []
    tweets_to_tag = []
    i = 0
    for file in tweet_files:
        username = os.path.splitext(os.path.basename(file))[0]
        if valid_users is not None and username not in valid_users:
            continue
        print(username)
        label = config.LABEL_IDS[users[username]['condition']]
        if label not in valid_labels:
            continue
        tweet_file = open(file, 'r')
        tweets = []
        dates = []
        nltk_pos_tags = []
        cmu_pos_tags = []
        for line in tweet_file:
            tweet = json.loads(line)
            if not is_retweeted_tweet(tweet):
                t = preprocess(tweet['text'], preserve_case=True)
                tweets.append(t)
                dates.append(datetime.strptime(tweet['created_at'], '%a %b %d %H:%M:%S +0000 %Y'))
                nltk_pos_tags.append(correct_emoji_pos_tags(nltk.pos_tag(tknzr_pos_tagging.tokenize(tweet['text']))))
                cmu_pos_tags.append(i)
                tweets_to_tag.append(clean_for_cmu_tagging(tweet['text']))
                i += 1
        tweet_file.close()
        D.append(np.array(dates))
        X.append(np.array(tweets))
        Y.append(label)
        Users.append(username)
        NLTK_PTAGS.append(nltk_pos_tags)
        CMU_PTAGS.append(cmu_pos_tags)
    PTAGS_NEW = []
    tagged = CMUTweetTagger.runtagger_parse(tweets_to_tag)
    for record in CMU_PTAGS:
        PTAGS_NEW.append([correct_emoji_pos_tags(tagged[r]) for r in record])
    CMU_PTAGS = PTAGS_NEW
    df = pd.DataFrame(data=np.vstack([X, CMU_PTAGS, NLTK_PTAGS, D, Y]).transpose(),
                      index=Users,
                      columns=['tweets', 'cmu_pos_tags', 'nltk_pos_tags', 'created_at', 'labels'])
    return df
def get_corpus(data):
    # [input]: dataframe [['text', 'reply']]
    # [output]: corpus: sentence list
    data_text = [sent.lower() for sent in data['text'].values.tolist()]
    data_reply = [sent.lower() for sent in data['reply'].values.tolist()]
    tw_tokenizer = nltk.TweetTokenizer()
    corpus_text = [tw_tokenizer.tokenize(sentence) for sentence in data_text]
    corpus_reply = [tw_tokenizer.tokenize(sentence) for sentence in data_reply]
    return (corpus_text, corpus_reply)
def is_user_mention(self):
    """ checks if token is a user mention """
    temp = nltk.TweetTokenizer(strip_handles=True)
    result = temp.tokenize(self.token)
    if result == []:
        return True
    else:
        return False
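# A quick illustration of the trick used in is_user_mention above: with
# strip_handles=True, TweetTokenizer drops Twitter handles entirely, so a
# token that is purely a mention should tokenize to an empty list.
# >>> nltk.TweetTokenizer(strip_handles=True).tokenize("@nltk_org")
# []
# >>> nltk.TweetTokenizer(strip_handles=True).tokenize("hello")
# ['hello']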
def preprocess(text):
    text = clean_tweet(text)
    tknzr = nltk.TweetTokenizer()
    text = tknzr.tokenize(text)
    ignore_words = set(stopwords.words('english'))
    # Lowercase each token; drop tokens of two characters or fewer, numbers,
    # and stopwords (note: the stopword check runs on the original casing)
    return [
        word.lower() for word in text
        if len(word) > 2 and not word.isdigit() and word not in ignore_words
    ]
def __init__(self, textLabel, predictor, predictionThermo, confidenceThermo, dayLabel, hourLabel):
    QtWidgets.QTextEdit.__init__(self)
    self.textLabel = textLabel
    self.predictor = predictor
    self.predictionThermo = predictionThermo
    self.confidenceThermo = confidenceThermo
    self.dayLabel = dayLabel
    self.hourLabel = hourLabel
    self.tweet_tokenizer = nltk.TweetTokenizer()
    self.stemmer = nltk.wordnet.WordNetLemmatizer()
def _regenerate_dictionaries(self, statistics=False) -> None:
    """Regenerate used n-grams and TF-IDF every time the data change.

    This can occur either when the training size is changed or a new
    training set is obtained."""
    # TF-IDF
    tknz = nltk.TweetTokenizer()
    self.tfidf = TfidfVectorizer(tokenizer=tknz.tokenize,
                                 max_features=self.max_tfidf)
    # get_raw_data returns tuples of the requested attributes (that is, (text,))
    self.tfidf.fit(
        list(map(lambda a: a[0],
                 self.get_raw_data(SampleTypeEnum.TRAIN, 'text'))))
    if statistics:
        self.print(
            f'Number of unique TF-IDF words: {len(self.tfidf.get_feature_names())}'
        )

    # n-grams - mutual information
    vectorizer: CountVectorizer = CountVectorizer(tokenizer=tknz.tokenize)
    word_matrix = vectorizer.fit_transform(
        list(map(lambda i: i[0],
                 self.get_raw_data(SampleTypeEnum.TRAIN, 'text'))))
    labels: List[str] = list(
        map(lambda a: a[0],
            self.get_raw_data(SampleTypeEnum.TRAIN, 'classification')))
    mi = mutual_info_classif(word_matrix, labels)
    top_mi = top_n_indexes(mi, self.max_ngrams)
    ngrams = vectorizer.get_feature_names()
    self.used_ngrams = set(map(lambda i: ngrams[i], top_mi))
    if statistics:
        self.print(f'Number of unique unigrams: {len(self.used_ngrams)}')

    # geneea entities
    # convert the lists of entities into sets and then join them into one set
    self.used_entities = reduce(
        lambda a, b: a.union(b),
        map(lambda i: set(i[0]),
            self.get_raw_data(SampleTypeEnum.TRAIN, 'entities')))
    if statistics:
        self.print(f'Number of unique entities: {len(self.used_entities)}')

    if statistics:
        train = self.get_raw_data(SampleTypeEnum.TRAIN, 'classification')
        test = self.get_raw_data(SampleTypeEnum.TEST, 'classification')
        counts = Counter(train) + Counter(test)
        self.print(counts)
def lemmatize_text(sentence):
    # Tokenize the tweet
    tokenized_text = nltk.TweetTokenizer().tokenize(sentence)
    # Lemmatize each token (WordNetLemmatizer defaults to the noun POS)
    lemmatizer = nltk.WordNetLemmatizer()
    filtered_tokens = []
    for word in tokenized_text:
        token = lemmatizer.lemmatize(word)
        filtered_tokens.append(token)
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text
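# A small usage sketch for lemmatize_text above; the sample sentence is
# invented. Because no POS tag is passed, WordNetLemmatizer treats every token
# as a noun, so plural nouns are reduced while verb forms are mostly untouched.
# >>> lemmatize_text("the cats are running")
# 'the cat are running'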
def extract_features(self):
    vector = CountVectorizer(
        min_df=2,
        tokenizer=nltk.TweetTokenizer(preserve_case=False).tokenize,
        encoding='ISO-8859-1',
        stop_words=nltk.corpus.stopwords.words('english'))
    train_counts = vector.fit_transform(self.train.data)
    test_counts = vector.transform(self.test.data)
    # NOTE: the TF-IDF matrices are computed here, but only the raw counts are returned
    tfidf_transformer = TfidfTransformer()
    train_tfidf = tfidf_transformer.fit_transform(train_counts)
    test_tfidf = tfidf_transformer.transform(test_counts)
    return train_counts, test_counts
def __init__(self, redis_client: redis.client.Redis, local: bool = False):
    self.redis_client = redis_client
    self.redis_table = "cryptos"
    self.local = local
    if self.local:
        self.crypto_db = {}
    self.loaded = False
    self.check_and_load_cryptos()
    self.ttokenizer = nltk.TweetTokenizer()  # tt = tweet tokenizer
    self.stemmer = nltk.PorterStemmer()  # porter stemmer
def tweet_tokenize(tweet):
    tweet = tweet["text"]
    tokenizer = nltk.TweetTokenizer(strip_handles=False, reduce_len=True)
    tokens = tokenizer.tokenize(tweet)
    ret = []
    for tok in tokens:
        if tok[0] == "@":
            ret.append("@user")
        elif tok[0] == "#":
            ret.append(tok[1:])
        else:
            ret.append(tok)
    return ret
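# A usage sketch for tweet_tokenize above; the tweet dict is made up. Mentions
# are replaced by the generic "@user" token, the leading '#' is stripped from
# hashtags, and reduce_len shortens elongated words. Expected output, roughly:
# >>> tweet_tokenize({"text": "@bob loves #python soooo much"})
# ['@user', 'loves', 'python', 'sooo', 'much']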
def get_tokens(self):
    tokenizer = nltk.TweetTokenizer()
    tokens = []
    for p in self.content:
        raw_tokens = self.clean_tokens(tokenizer.tokenize(p))
        # Split the token stream into sentences at each '.' token
        while raw_tokens.count('.') != 0:
            bound = raw_tokens.index('.')
            sentence = raw_tokens[:bound]
            raw_tokens = raw_tokens[bound + 1:]
            tokens.append(sentence)
        # Keep any trailing tokens after the last '.'
        if len(raw_tokens) != 0:
            tokens.append(raw_tokens)
    return tokens[:]
def process_tokens(tweet):
    """ Create the tokens and remove the stop words """
    stop_words = set([
        'the', 'to', 'in', 'on', 'and', 'of', 'a', 'for', 'at', 'with', 'be',
        'it', 'that', '-', 'this'
    ])
    tknzr = nltk.TweetTokenizer(strip_handles=True)
    tokens = tknzr.tokenize(tweet)
    return [token for token in tokens if token not in stop_words]
def clean_tweets(self, doc):
    """ remove punctuation and stopwords """
    tknzr = nltk.TweetTokenizer()
    if self.kwargs.get('vader'):
        words = ' '.join([word for word in tknzr.tokenize(doc['text'])
                          if word.isalpha()])
        return words
    else:
        words = ' '.join([word for word in tknzr.tokenize(doc)
                          if word.isalpha()
                          and word not in self.stopwords
                          and len(word) > 2])
        self.words.append(words)
        if self.kwargs.get('debug'):
            print(words)
        return words
def initialize():
    """ Performs preprocessing on the wine_data table """
    global tokens  # this is jank, I know. I'll fix this later but it works for now
    tokens = []
    for d in reviews:
        # Makes each word in a review lowercase
        d = d.lower()
        # Breaks each review down into its individual terms and stores the
        # words in a list where each word is an individual entry
        # EXAMPLE: "hello world" ==> ["hello", "world"]
        tokens.append(nltk.TweetTokenizer().tokenize(d))
    tokens = del_stop_word(tokens)
    tokens = del_punc(tokens)
    tokens = porter_stem(tokens)
def tokenize(text):
    text = text.lower()  # make all letters lowercase
    text = re.sub('[^A-Za-z]', ' ', text)  # remove non-alphabetic characters
    """stops = set(stopwords.words("italian"))
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    filtered_words = [word for word in text.split() if word not in stops]
    filtered_words = gensim.corpora.textcorpus.remove_short(filtered_words, minsize=3)
    text = " ".join(filtered_words)
    text = gensim.parsing.preprocessing.strip_punctuation2(text)
    text = gensim.parsing.preprocessing.strip_numeric(text)
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    """
    return nltk.TweetTokenizer(reduce_len=True, strip_handles=True).tokenize(text)
def init_word_tokenizers(main, lang, word_tokenizer='default'):
    if lang not in main.settings_global['word_tokenizers']:
        lang = 'other'

    if word_tokenizer == 'default':
        word_tokenizer = main.settings_custom['word_tokenization']['word_tokenizers'][lang]

    # NLTK
    if word_tokenizer.startswith('nltk_'):
        if word_tokenizer == 'nltk_nist':
            if 'nltk_nist_tokenizer' not in main.__dict__:
                main.nltk_nist_tokenizer = nltk.tokenize.nist.NISTTokenizer()
        elif word_tokenizer == 'nltk_nltk':
            if 'nltk_nltk_tokenizer' not in main.__dict__:
                main.nltk_nltk_tokenizer = nltk.NLTKWordTokenizer()
        elif word_tokenizer == 'nltk_penn_treebank':
            if 'nltk_treebank_tokenizer' not in main.__dict__:
                main.nltk_treebank_tokenizer = nltk.TreebankWordTokenizer()
        elif word_tokenizer == 'nltk_tok_tok':
            if 'nltk_toktok_tokenizer' not in main.__dict__:
                main.nltk_toktok_tokenizer = nltk.ToktokTokenizer()
        elif word_tokenizer == 'nltk_twitter':
            if 'nltk_tweet_tokenizer' not in main.__dict__:
                main.nltk_tweet_tokenizer = nltk.TweetTokenizer()
    # Sacremoses
    elif word_tokenizer == 'sacremoses_moses':
        lang_sacremoses = wl_conversion.remove_lang_code_suffixes(main, wl_conversion.to_iso_639_1(main, lang))
        lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        if f'sacremoses_moses_tokenizer_{lang}' not in main.__dict__:
            main.__dict__[f'sacremoses_moses_tokenizer_{lang}'] = sacremoses.MosesTokenizer(lang=lang_sacremoses)
    # spaCy
    elif word_tokenizer.startswith('spacy_'):
        init_spacy_models(main, lang)
    # Chinese
    elif word_tokenizer == 'pkuseg_zho':
        if 'pkuseg_word_tokenizer' not in main.__dict__:
            main.pkuseg_word_tokenizer = pkuseg.pkuseg()
    # Chinese & Japanese
    elif word_tokenizer.startswith('wordless_'):
        init_spacy_models(main, 'eng_us')
        init_spacy_models(main, 'other')
    # Japanese
    elif word_tokenizer.startswith('sudachipy_jpn'):
        if 'sudachipy_word_tokenizer' not in main.__dict__:
            main.sudachipy_word_tokenizer = sudachipy.Dictionary().create()
    # Tibetan
    elif word_tokenizer == 'botok_bod':
        if 'botok_word_tokenizer' not in main.__dict__:
            main.botok_word_tokenizer = botok.WordTokenizer()
def tokenize(text):
    '''Generic wrapper around different tokenization methods.'''
    text = text.lower()  # lowercase all text
    text = re.sub(r'@[A-Z0-9a-z_:!@#$%^&()=+,.></?;|@#]+', 'user', text)  # replace users with "user"
    text = text.replace("#", "")  # delete hashtags
    text = re.sub('https?://[A-Za-z0-9./#]+', 'link', text)  # replace links with "link"
    text = re.sub(r'\d+', '', text)  # remove numbers
    text = text.strip()  # remove leading and trailing spaces

    res = ""
    stop_words = set(stopwords.words('italian'))
    text = text.split()
    for word in text:
        cuv = word
        for stop_word in stop_words:
            if word == stop_word or len(word) < 4:
                # drop the most common Italian words and words shorter than 4 characters
                cuv = ""
        res += cuv + " "
    res = res[:-1]

    result = res
    if len(result) > 2:
        if result[0] == " ":
            result = result[1:]
    result = ''.join(result)
    '''
    for cuvant in result:
        if len(cuvant) < 4:
            result = result.replace(cuvant, "")
    stemming = ''.join(result)
    stemming = [stemmer.stem(k) for k in result]
    stemming = ' '.join(stemming)
    stemming = sp(stem)
    lemma = []
    for cuvant in stemming:
        lemma.append(cuvant.lemma_)
    result = ' '.join(lemma)
    '''
    # return nltk.WordPunctTokenizer().tokenize(result)
    return nltk.TweetTokenizer(strip_handles=True, reduce_len=True).tokenize(result)
def analyze(self, text):
    """Analyze text for sentiment, returning its score."""
    self.score = 0
    tokenizer = nltk.TweetTokenizer(strip_handles=True, reduce_len=True)
    tweettokens = [
        i.lower() for i in tokenizer.tokenize(text) if len(i) > 2
    ]
    for token in tweettokens:
        if token in self.positives:
            self.score = self.score + 1
        elif token in self.negatives:
            self.score = self.score - 1
    # print(tweettokens, "\t score = ", self.score)
    # TODO
    return self.score
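# A minimal sketch of how the analyze method above scores text, assuming an
# analyzer instance whose self.positives and self.negatives lexicons contain
# e.g. "great" and "awful" (the instance and the word lists are hypothetical):
# >>> analyzer.analyze("This update is great, not awful at all")
# 0   # one positive hit ("great") cancels one negative hit ("awful")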
def tokenize(term_vector):
    term_tokens = []
    for d in term_vector:
        # Makes each word in a review lowercase
        d = d[0].lower()
        # Breaks each review down into its individual terms and stores the
        # words in a list where each word is an individual entry
        # EXAMPLE: "hello world" ==> ["hello", "world"]
        term_tokens.append(nltk.TweetTokenizer().tokenize(d))
    return term_tokens
def __init__(self, dist_file_path=None):
    """ Initialize module with default data/english.dist file """
    if dist_file_path is None:
        dist_file_path = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), "data/english.dist")
    with open(dist_file_path, "rb") as distributions_file:
        pickle_dict = pickle.load(distributions_file)
    self.uni_dist = pickle_dict["uni_dist"]
    self.backward_bi_dist = pickle_dict["backward_bi_dist"]
    self.forward_bi_dist = pickle_dict["forward_bi_dist"]
    self.trigram_dist = pickle_dict["trigram_dist"]
    self.word_casing_lookup = pickle_dict["word_casing_lookup"]
    self.tknzr = nltk.TweetTokenizer()