def to_iob(text, target, label, tokenizer=TweetTokenizer()):
    """Tag `text` with IOB labels for `target`: try an exact token match first,
    then a substring match, and finally append the target after an
    end-of-sentence marker if it was not found at all."""
    text = tokenizer.tokenize(text)
    target = target.lower().split()
    found = False
    res = list()

    # Pass 1: exact token-by-token match of the target span
    i = 0
    while i < len(text):
        if found or target[0] != text[i]:
            res.append((text[i], 'o'))
        else:
            if len(target) == 1:
                res.append((text[i], 'b-{}'.format(label)))
                found = True
            else:
                good = True
                for j in range(1, len(target)):
                    if i + j == len(text):
                        good = False
                        break
                    if target[j] != text[i + j]:
                        good = False
                        break
                if good:
                    res.append((text[i], 'b-{}'.format(label)))
                    for j in range(1, len(target)):
                        # i advances inside the loop, so text[i + 1] walks
                        # through the remaining target tokens one by one
                        res.append((text[i + 1], 'i-{}'.format(label)))
                        i += 1
                    found = True
        i += 1
    if found:
        return res

    # Pass 2: fall back to substring matching inside tokens
    res = list()
    i = 0
    while i < len(text):
        if found or (target[0] not in text[i]):
            res.append((text[i], 'o'))
        else:
            for t in target:
                if t in text[i]:
                    res.append((text[i], '{}-{}'.format(
                        ('b' if t == target[0] else 'i'), label)))
                    i += 1
                    found = True
                    if i == len(text):
                        break
                else:
                    break
            i -= 1
        i += 1
    if found:
        return res

    # Pass 3: target never found; append it after an end-of-sentence marker
    res.append(('</s>', 'o'))
    for t in target:
        res.append((t, '{}-{}'.format(('b' if t == target[0] else 'i'), label)))
    return res
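A minimal usage sketch (the input sentence and expected output are illustrative, not from the original source, and `from nltk.tokenize import TweetTokenizer` is assumed to be in scope). With a lower-cased sentence, the first exact-match pass tags the target span and everything else gets 'o':

print(to_iob("i love new york so much", "new york", "loc"))
# [('i', 'o'), ('love', 'o'), ('new', 'b-loc'), ('york', 'i-loc'), ('so', 'o'), ('much', 'o')]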
import re

from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

STOPWORDS = set(stopwords.words("english"))
TKNZR = TweetTokenizer()


def remove_url(txt):
    """Replace URLs found in a text string with nothing
    (i.e. it will remove the URL from the string).

    Parameters
    ----------
    txt : string
        A text string that you want to parse and remove URLs from.

    Returns
    -------
    The same txt string with URLs removed.
    """
    return " ".join(re.sub(r"([^0-9A-Za-z \t])|(\w+:\/\/\S+)", "", txt).split())


def tweet_toks(tweet):
    tweet = remove_url(tweet.lower())
    tweet_words = TKNZR.tokenize(tweet)
    tweet_words = [x for x in tweet_words if x not in STOPWORDS]
    return tweet_words
"who've" : "who have", "won't" : "will not", "wouldn't" : "would not", "you'd" : "you would", "you'll" : "you will", "you're" : "you are", "you've" : "you have", "'re": " are", "wasn't": "was not", "we'll":" will", "didn't": "did not", "tryin'":"trying" } ## Tokenizers tokenizer = TweetTokenizer() lem = WordNetLemmatizer() engstopwords = set(stopwords.words('english')) punc = string.punctuation ## emojis wriiten using brakets remove this emojis emo = {'):','(:',':',':-)',':))'} ## Cleaning the data def data_cleaning(data): #Remove email re_email = re.compile(r'[\w.-]+@[\w.-]+') data= re_email.sub(r'',data) #Remove Emoji data = demoji.replace(data,repl='') #Remove webiste reg_website = re.compile(r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([\w+-a-zA-Z0-9()@:%_\+.~#?&//=]*)')
def processtweets(dirPath, limitStr):
    # convert the limit argument from a string to an int
    limit = int(limitStr)

    # initialize NLTK built-in tweet tokenizer
    twtokenizer = TweetTokenizer()

    os.chdir(dirPath)
    f = open('./downloaded-tweeti-b-dist.tsv', 'r')

    # loop over lines in the file and use the first limit of them,
    # assuming that the tweets are sufficiently randomized
    tweetdata = []
    for line in f:
        if len(tweetdata) < limit:
            # remove final end of line character
            line = line.strip()
            # each line has 4 items separated by tabs;
            # ignore the tweet and user ids, and keep the sentiment and tweet text
            tweetdata.append(line.split('\t')[2:4])

    # create list of tweet documents as (list of words, label)
    # where the labels are condensed to just 3: 'pos', 'neg', 'neu'
    tweetdocs = []
    # add all the tweets except the ones whose text is Not Available
    for tweet in tweetdata:
        if tweet[1] != 'Not Available':
            processedtweet = _processtweets(tweet[1])
            # run the tweet tokenizer on the text string
            tokens = twtokenizer.tokenize(processedtweet)
            if tweet[0] == '"positive"':
                label = 'pos'
            elif tweet[0] == '"negative"':
                label = 'neg'
            elif tweet[0] in ('"neutral"', '"objective"', '"objective-OR-neutral"'):
                label = 'neu'
            else:
                label = ''
            tweetdocs.append((tokens, label))

    ## shuffle dataset and generate feature set with most common 2000 words
    random.shuffle(tweetdocs)
    all_words_list = [word for (sent, cat) in tweetdocs for word in sent]
    all_words = nltk.FreqDist(all_words_list)
    word_items = all_words.most_common(2000)
    word_features = [word for (word, count) in word_items]

    ## Base accuracy with all the words
    featuresets = getfeatureset(tweetdocs, word_features)
    setsize = int(limit / 5)
    train_set, test_set = featuresets[setsize:], featuresets[:setsize]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    print("Base accuracy with all the words: " +
          str(nltk.classify.accuracy(classifier, test_set)))
    print("****** Most informative 20 features *********")
    classifier.show_most_informative_features(20)
    print("**********************************************")

    ## Cross validation with k folds
    num_folds = 5
    print("Cross validation with 5 folds: ")
    cross_validation_accuracy(num_folds, featuresets)
    cross_validation_metrics(num_folds, featuresets)

    ## Stop words removal
    dirname = os.path.realpath('..')
    stopwordspath = dirname + '/stopwords_twitter.txt'
    stopwords = [line.strip() for line in open(stopwordspath)]
    negationwords = [
        'no', 'not', 'never', 'none', 'nowhere', 'nothing', 'noone', 'rather',
        'hardly', 'scarcely', 'rarely', 'seldom', 'neither', 'nor'
    ]
    negationwords.extend([
        'ain', 'aren', 'couldn', 'didn', 'doesn', 'hadn', 'hasn', 'haven',
        'isn', 'ma', 'mightn', 'mustn', 'needn', 'shan', 'shouldn', 'wasn',
        'weren', 'won', 'wouldn'
    ])
    newstopwords = [word for word in stopwords if word not in negationwords]
    new_all_words_list = [
        word for (sent, cat) in tweetdocs for word in sent
        if word not in newstopwords
    ]
    new_all_words = nltk.FreqDist(new_all_words_list)
    new_word_items = new_all_words.most_common(2000)
    new_word_features = [word for (word, count) in new_word_items]

    ## Accuracy with stop word removal
    new_featuresets = getfeatureset(tweetdocs, new_word_features)
    setsize = int(limit / 5)
    train_set, test_set = new_featuresets[setsize:], new_featuresets[:setsize]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    print("Accuracy with stop words removal: " +
          str(nltk.classify.accuracy(classifier, test_set)))

    ## Subjectivity lexicon
    SLfeaturesets = getSLfeatureset(tweetdocs, word_features)
    setsize = int(limit / 5)
    train_set, test_set = SLfeaturesets[setsize:], SLfeaturesets[:setsize]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    print("Accuracy with Subjectivity lexicon: " +
          str(nltk.classify.accuracy(classifier, test_set)))

    ## Negation words
    notfeaturesets = getNOTfeatureset(tweetdocs, word_features, negationwords)
    setsize = int(limit / 5)
    train_set, test_set = notfeaturesets[setsize:], notfeaturesets[:setsize]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    print("Accuracy with negation words features: " +
          str(nltk.classify.accuracy(classifier, test_set)))

    ## Negation words and stop words removal
    newnotfeaturesets = getNOTfeatureset(tweetdocs, new_word_features, negationwords)
    setsize = int(limit / 5)
    train_set, test_set = newnotfeaturesets[setsize:], newnotfeaturesets[:setsize]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    print("Accuracy with negation words features and stop words removal: " +
          str(nltk.classify.accuracy(classifier, test_set)))

    ## Opinion lexicon
    opinionlexfeaturesets = getopinionfeatureset(tweetdocs, word_features)
    train_set, test_set = opinionlexfeaturesets[1000:], opinionlexfeaturesets[:1000]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    print("Accuracy with Opinion lexicon: " +
          str(nltk.classify.accuracy(classifier, test_set)))

    ## Advanced experiments
    print("*************** Advanced experiments **************")

    ## sklearn SGDClassifier
    skfeatureset = [tweet_features(d, word_features) for (d, c) in tweetdocs]
    X = pd.DataFrame.from_dict(skfeatureset)
    y = [c for (d, c) in tweetdocs]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
                                                        random_state=42)
    clf = SGDClassifier()
    clf.fit(X_train, y_train)
    predicted = clf.predict(X_test)
    print("Accuracy with SGDClassifier - default parameters: " +
          str(np.mean(predicted == y_test)))

    text_clf_svm = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                                 max_iter=6, random_state=42)
    text_clf_svm.fit(X_train, y_train)
    predicted_svm = text_clf_svm.predict(X_test)
    print("Accuracy with SGDClassifier: " + str(np.mean(predicted_svm == y_test)))

    ## Different classifiers from sklearn
    ## ref: https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html
    skfeatureset = [tweet_features(d, word_features) for (d, c) in tweetdocs]
    X = pd.DataFrame.from_dict(skfeatureset)
    y = [c for (d, c) in tweetdocs]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
                                                        random_state=42)
    h = .02  # step size in the mesh (unused here)
    names = [
        "Nearest Neighbors", "Linear SVM", "Decision Tree", "Random Forest",
        "Naive Bayes"
    ]
    classifiers = [
        KNeighborsClassifier(3),
        SVC(kernel="linear", C=0.025),
        DecisionTreeClassifier(max_depth=5),
        RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
        GaussianNB()
    ]
    for name, clf in zip(names, classifiers):
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        predicted = clf.predict(X_test)
        print(name + " : " + str(score))

    ## Different classifiers from sklearn with stop words removal
    ## ref: https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html
    sknewfeatureset = [
        tweet_features(d, new_word_features) for (d, c) in tweetdocs
    ]
    print("Classifiers with stop word removal")
    X = pd.DataFrame.from_dict(sknewfeatureset)
    y = [c for (d, c) in tweetdocs]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
                                                        random_state=42)
    h = .02  # step size in the mesh (unused here)
    names = [
        "Nearest Neighbors", "Linear SVM", "Decision Tree", "Random Forest",
        "Naive Bayes"
    ]
    classifiers = [
        KNeighborsClassifier(3),
        SVC(kernel="linear", C=0.025),
        DecisionTreeClassifier(max_depth=5),
        RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
        GaussianNB()
    ]
    for name, clf in zip(names, classifiers):
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        predicted = clf.predict(X_test)
        print(name + " : " + str(score))
def my_tokenize(s):
    tknzr = TweetTokenizer()
    return tknzr.tokenize(s)
def build_word_statistics(text_file_path, label_file_path, no_sentiment_words=True):
    if no_sentiment_words:
        logger.info("Also excluding sentiment words!")

    text_tokenizer = TweetTokenizer()

    vocab = set()
    with open(text_file_path) as text_file:
        for line in text_file:
            vocab.update(text_tokenizer.tokenize(line.lower()))

    stopwords = lexicon_helper.get_stopwords()
    vocab -= stopwords
    if no_sentiment_words:
        sentiment_words = lexicon_helper.get_sentiment_words()
        vocab -= sentiment_words
    logger.debug(vocab)
    logger.info("Vocab size after filtering: " + str(len(vocab)))

    labels = set()
    with open(label_file_path) as label_file:
        for label_line in label_file:
            label = label_line.strip()
            labels.add(label)
    logger.debug(labels)

    empty_template = dict()
    for label in labels:
        empty_template[label] = 0
    logger.debug(empty_template)

    word_occurrences = dict()
    with open(text_file_path) as text_file, open(label_file_path) as label_file:
        for text_line, label_line in zip(text_file, label_file):
            words = text_tokenizer.tokenize(text_line.strip())
            label = label_line.strip()
            for word in words:
                if len(word) > 3 and word in vocab:
                    if word not in word_occurrences:
                        word_occurrences[word] = empty_template.copy()
                    word_occurrences[word][label] += 1
    logger.debug(word_occurrences)

    label_word_scores = dict()
    for label in labels:
        word_scores = list()
        for word in word_occurrences:
            try:
                occurrence = word_occurrences[word]
                positive_count = occurrence[label]
                negative_count = sum(occurrence.values()) - positive_count
                kld = positive_count * (math.log(positive_count) / math.log(negative_count))
                word_scores.append((kld, word))
            except Exception:
                logger.debug("error while processing word '{}' for label '{}'".format(word, label))
        label_word_scores[label] = word_scores
    logger.debug(label_word_scores)

    for label in label_word_scores:
        word_scores = label_word_scores[label]
        word_scores.sort(key=lambda x: x[0], reverse=True)
        most_correlated = [x[1] for x in word_scores[:100]]
        logger.info("For label '{}'".format(label))
        logger.info("Most correlated words: {}".format(most_correlated))
def represent_tweet(tweets):
    tokens = TweetTokenizer().tokenize(tweets)
    frequency = defaultdict(int)
    for token in tokens:
        frequency[token] += 1
    return frequency
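A usage sketch with an illustrative input (assumes `from collections import defaultdict` and the TweetTokenizer import are in scope): repeated tokens accumulate and punctuation is counted as its own token.

print(dict(represent_tweet("good vibes, good times")))
# {'good': 2, 'vibes': 1, ',': 1, 'times': 1}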
def extract_features(tweet):
    features = []
    tknzr = TweetTokenizer()
    ps = SnowballStemmer("english")

    # HASHTAG COUNT
    list_hashtag = re.findall(r'\B(\#[a-zA-Z]+\b)(?!;)', tweet)
    features.append(len(list_hashtag))

    # PUNCTUATION COUNT & PRESENCE
    count = lambda l1, l2: sum([1 for x in l1 if x in l2])
    count_punct = count(tweet, set(punctuation))
    features.append(count_punct)
    features.append(count_punct != 0)

    # EXCLAMATION MARK COUNT & PRESENCE
    list_exclamation = re.findall(r'\!', tweet)
    features.append(len(list_exclamation))
    features.append(len(list_exclamation) != 0)

    # QUESTION MARK COUNT & PRESENCE
    list_question = re.findall(r'\?', tweet)
    features.append(len(list_question))
    features.append(len(list_question) != 0)

    # REMOVE PUNCTUATION
    tweet = tweet.translate(str.maketrans('', '', punctuation))

    # EMOJI COUNT & PRESENCE
    demojized_tweet = emoji.demojize(tweet)
    emoji_list = re.findall(r'(:[^:]*:)', demojized_tweet)
    list_emoji = [emoji.emojize(x) for x in emoji_list]
    features.append(len(emoji_list))
    features.append(len(emoji_list) != 0)

    # REMOVE EMOJIS
    for emo in list_emoji:
        if emo in tweet:
            tweet = tweet.replace(emo, "")

    # WORD COUNT & LENGTH: number of words without stopwords, average word length
    tokens = [x for x in tknzr.tokenize(tweet)]
    bag_of_words = [x for x in tokens if x.lower() not in stop_words]
    stemmed_text = [
        ps.stem(x) for x in bag_of_words if x.lower() not in stop_words
    ]
    features.append(len(bag_of_words))
    features.append(float(sum(map(len, bag_of_words))) / len(bag_of_words))

    # SPECIFIC WORD COUNTS
    favor_words = ['blm', 'blacklivesmatter']
    against_words = ['alllivesmatter', 'bluelivesmatter']
    favor = [x for x in bag_of_words if x.lower() in favor_words]
    against = [x for x in bag_of_words if x.lower() in against_words]
    features.append(len(favor))
    features.append(len(favor) / len(bag_of_words))
    features.append(len(favor) != 0)
    features.append(len(against))
    features.append(len(against) / len(bag_of_words))
    features.append(len(against) != 0)

    # UPPERCASE WORD PRESENCE
    upper_words = [x for x in bag_of_words if x.isupper()]
    features.append(len(upper_words) != 0)

    # CHECK FOR ELONGATED WORDS
    regex = re.compile(r"(.)\1{2}")
    elongated_words = [word for word in bag_of_words if regex.search(word)]
    features.append(len(elongated_words))
    features.append(len(elongated_words) != 0)

    # POS TAGS: frequency of selected POS tags
    pos_tags = nltk.pos_tag(stemmed_text)
    counts = Counter(tag for word, tag in pos_tags)
    features.append(counts['JJ'])
    features.append(counts['RB'])
    features.append(counts['VB'])
    features.append(counts['NN'])

    # LETTER FEATURES: number of letters, frequency of each letter and each capital letter
    features.append(len(tweet))
    for letter in list(string.ascii_lowercase):
        features.append(len([x for x in tweet if x.lower() == letter]))
    for letter in list(string.ascii_uppercase):
        features.append(len([x for x in tweet if x == letter]))
    total_upper = sum(1 for x in tweet if x.isupper())
    total_lower = sum(1 for x in tweet if x.islower())
    features.append(total_upper / len(tweet))
    features.append(total_lower / len(tweet))

    return features
import argparse
import math
import codecs
import os
import sys

from flask import Flask, jsonify, request
from nltk.tokenize import TweetTokenizer
# from nltk.tokenize.moses import MosesDetokenizer

app = Flask(__name__)
args = None
translator = None
# dt = MosesDetokenizer()
tt = TweetTokenizer()


@app.route("/api/health")
def health():
    return jsonify({"status": "ok"}), 200


@app.route("/api/translate", methods=['POST'])
def score():
    line = request.get_json()['text']
    # srcTokens = tt.tokenize(line.lower())
    # srcBatch = [srcTokens]
    # predBatch, predScore, goldScore, attn, src = translator.translate(srcBatch, None)
    # responses = [dt.detokenize(i, return_str=True) for i in predBatch[0]]
    # scores = list(predScore[0])
def preprocess_tweet(tweet):
    tokenizer = TweetTokenizer(preserve_case=False)
    return [token for token in tokenizer.tokenize(tweet) if token_is_allowed(token)]
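The helper token_is_allowed is defined elsewhere in the original project. A purely illustrative stand-in (an assumption, not the author's filter; STOPWORDS is a hypothetical stop-word set) might drop stopwords, URLs, and bare punctuation:

# Hypothetical stand-in for the project's token_is_allowed.
def token_is_allowed(token):
    return (token not in STOPWORDS
            and not token.startswith("http")
            and any(ch.isalnum() for ch in token))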
import csv
import time
import datetime

import tweepy
from tweepy import RateLimitError
from nltk.tokenize import TweetTokenizer

from DataExtraction.Scripts import twitter_credentials

date_now = datetime.datetime.now()
tknzr = TweetTokenizer(strip_handles=True)

access_token = twitter_credentials.ACCESS_TOKEN
access_token_secret = twitter_credentials.ACCESS_TOKEN_SECRET
consumer_key = twitter_credentials.CONSUMER_KEY
consumer_secret = twitter_credentials.CONSUMER_SECRET

auth = tweepy.auth.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth, wait_on_rate_limit=True)

startDate = datetime.datetime(2020, 1, 1, 0, 0, 0)
endDate = datetime.datetime.now()

csvFile = open('OldTimeLineTweets.csv', 'a', encoding="utf-16")  # ../../../CollectedData/NDC/Pres
csvWriter = csv.writer(csvFile)
csvWriter.writerow(["Date", "Id", "Text", "Likes", "Re-Tweets", "Location"])
from nltk import word_tokenize
from nltk.tokenize import TweetTokenizer

tweet = '@Hari: This sounds good! :D'
print(word_tokenize(tweet))
print(TweetTokenizer().tokenize(tweet))
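For reference, the two print calls above produce roughly the following (exact tokens can vary slightly across NLTK versions); TweetTokenizer keeps the @-handle and the emoticon as single tokens, while word_tokenize splits them apart:

# word_tokenize:  ['@', 'Hari', ':', 'This', 'sounds', 'good', '!', ':', 'D']
# TweetTokenizer: ['@Hari', ':', 'This', 'sounds', 'good', '!', ':D']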
def __init__(self, max_dict_size=MM_MAX_DICT_SIZE, device="cpu"):
    self.max_dict_size = max_dict_size
    self.token_to_id = {TOKEN_UNK: 0}
    self.next_id = 1
    self.tokenizer = TweetTokenizer(preserve_case=True)
    self.device = device
from nltk.tokenize import TweetTokenizer
import pymorphy2

NUMB_OF_FEATURES = 6

twtk = TweetTokenizer()
morph = pymorphy2.MorphAnalyzer(lang='uk')

POS_l = [
    "Unk", "NOUN", "ADJF", "ADJS", "COMP", "VERB", "INFN", "PRTS", "GRND",
    "NUMR", "ADVB", "NPRO", "PRED", "PREP", "CONJ", "PRCL", "INTJ",
]
cases_l = [
    'Unk', 'nomn', 'gent', 'datv', 'accs', 'ablt', 'loct', 'voct', 'gen2',
    'acc2', 'loc2'
]
def process_tweet(
    tweet,
    remove_USER_URL=True,
    remove_punctuation=True,
    remove_stopwords=True,
    remove_HTMLentities=True,
    remove_hashtags=True,
    appostrophe_handling=True,
    lemmatize=True,
    trial=False,
    sym_spell=None,
    reduce_lengthenings=True,
    segment_words=True,
    correct_spelling=True,
):
    """
    This function receives tweets and returns a clean word list.
    """
    ### Handle USERs and URLs ################################################
    if remove_USER_URL:
        if trial:
            tweet = re.sub(r'@\w+ ?', '', tweet)
            tweet = re.sub(r'http\S+', '', tweet)
        else:
            tweet = re.sub(r"@USER", "<>", tweet)
            tweet = re.sub(r"URL", "", tweet)
    else:
        if trial:
            tweet = re.sub(r'@\w+ ?', '<usertoken> ', tweet)
            tweet = re.sub(r'http\S+', '<urltoken> ', tweet)
        else:
            tweet = re.sub(r"@USER", "<usertoken>", tweet)
            tweet = re.sub(r"URL", "<urltoken>", tweet)

    ### Remove HTML entities #################################################
    if remove_HTMLentities:
        tweet = html.unescape(tweet)

    ### Remove hashtags ######################################################
    if remove_hashtags:
        tweet = re.sub(r'#\w+ ?', '', tweet)

    ### Convert to lower case: Hi -> hi, MAGA -> maga ########################
    tweet = tweet.lower()

    ### Cleaning: non-ASCII filtering, some apostrophes, separation ##########
    tweet = re.sub(r"’", r"'", tweet)
    tweet = re.sub(r"[^A-Za-z0-9'^,!.\/+-=@]", " ", tweet)
    tweet = re.sub(r"what's", "what is ", tweet)
    tweet = re.sub(r"\'s", " ", tweet)
    tweet = re.sub(r"\'ve", " have ", tweet)
    tweet = re.sub(r"n't", " not ", tweet)
    # tweet = re.sub(r"i'm", "i am ", tweet)
    tweet = re.sub(r"\'re", " are ", tweet)
    tweet = re.sub(r"\'d", " would ", tweet)
    tweet = re.sub(r"\'ll", " will ", tweet)
    tweet = re.sub(r",", " ", tweet)
    tweet = re.sub(r"\.", " ", tweet)
    tweet = re.sub(r"!", " ! ", tweet)
    tweet = re.sub(r"\/", " ", tweet)
    tweet = re.sub(r"\^", " ^ ", tweet)
    tweet = re.sub(r"\+", " + ", tweet)
    tweet = re.sub(r"\-", " - ", tweet)
    tweet = re.sub(r"\=", " = ", tweet)
    tweet = re.sub(r"(\d+)(k)", r"\g<1>000", tweet)
    tweet = re.sub(r":", " : ", tweet)
    tweet = re.sub(r" e g ", " eg ", tweet)
    tweet = re.sub(r" b g ", " bg ", tweet)
    tweet = re.sub(r" u s ", " american ", tweet)
    tweet = re.sub(r"\0s", "0", tweet)
    tweet = re.sub(r" 9 11 ", "911", tweet)
    tweet = re.sub(r"e - mail", "email", tweet)
    tweet = re.sub(r"j k", "jk", tweet)
    tweet = re.sub(r"\s{2,}", " ", tweet)

    ### Remove punctuation ###################################################
    if remove_punctuation:
        translator = str.maketrans(
            '', '', ''.join(list(set(string.punctuation) - set("'"))))
        tweet = tweet.translate(translator)

    # Tokenize sentence for further word-level processing
    tokenizer = TweetTokenizer()
    words = tokenizer.tokenize(tweet)

    ### Apostrophe handling: you're -> you are ###############################
    APPO = {
        "aren't": "are not", "can't": "cannot", "couldn't": "could not",
        "didn't": "did not", "doesn't": "does not", "don't": "do not",
        "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
        "he'd": "he would", "he'll": "he will", "he's": "he is",
        "i'd": "I would", "i'll": "I will", "i'm": "I am", "isn't": "is not",
        "it's": "it is", "it'll": "it will", "i've": "I have",
        "let's": "let us", "mightn't": "might not", "mustn't": "must not",
        "shan't": "shall not", "she'd": "she would", "she'll": "she will",
        "she's": "she is", "shouldn't": "should not", "that's": "that is",
        "there's": "there is", "they'd": "they would", "they'll": "they will",
        "they're": "they are", "they've": "they have", "we'd": "we would",
        "we're": "we are", "weren't": "were not", "we've": "we have",
        "what'll": "what will", "what're": "what are", "what's": "what is",
        "what've": "what have", "where's": "where is", "who'd": "who would",
        "who'll": "who will", "who're": "who are", "who's": "who is",
        "who've": "who have", "won't": "will not", "wouldn't": "would not",
        "you'd": "you would", "you'll": "you will", "you're": "you are",
        "you've": "you have", "'re": " are", "wasn't": "was not",
        "we'll": " will"
    }
    if appostrophe_handling:
        words = [APPO[word] if word in APPO else word for word in words]
        tweet = ' '.join(words)
        words = tokenizer.tokenize(tweet)

    ### Lemmatization: drinking -> drink #####################################
    if lemmatize:
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word, "v") for word in words]

    ### Remove stop words: is, that, the, ... ################################
    if remove_stopwords:
        eng_stopwords = set(stopwords.words("english"))
        words = [w for w in words if w not in eng_stopwords]

    ### Reduce lengthening: aaaaaaaaah -> aah, bleeeeerh -> bleerh ###########
    if reduce_lengthenings:
        pattern = re.compile(r"(.)\1{2,}")
        words = [pattern.sub(r"\1\1", w) for w in words]

    if sym_spell and (segment_words or correct_spelling):
        ### Segment words: thecatonthemat -> the cat on the mat ##############
        if segment_words:
            words = [
                sym_spell.word_segmentation(word).corrected_string
                for word in words
            ]

        ### Correct spelling: birberals -> liberals ##########################
        if correct_spelling:
            def correct_spelling_for_word(word):
                suggestions = sym_spell.lookup(word, Verbosity.TOP, 2)
                if len(suggestions) > 0:
                    return suggestions[0].term
                return word

            words = [correct_spelling_for_word(word) for word in words]

    clean_tweet = " ".join(words)
    clean_tweet = re.sub(r" +", " ", clean_tweet)  # collapse repeated spaces
    clean_tweet = clean_tweet.lower()
    return clean_tweet
def dummy_fun(doc):
    return TweetTokenizer().tokenize(doc)
def runFitting(params, objects):
    TASK = 'binary'
    # TASK = 'multi'

    '''
    Preparing data
    '''
    featureList = []
    if params["sentenceComplexityCheck"]:
        featureList.append("posTag")
    if params["embeddingsTermFreqFiltering"]:
        objects["freqFilter"].embeddingsEnabled = True
    if params["oneHotTermFreqFiltering"]:
        objects["freqFilter"].oneHotEnabled = True
    objects["liguisticFeatureExtractor"].setFeatureList(featureList)

    offenseval_train = './Data/train/offenseval-training-v1.tsv'

    # print('Reading in offenseval training data...')
    if TASK == 'binary':
        IDsTrain, Xtrain, Ytrain = helperFunctions.read_corpus(offenseval_train)
    else:
        IDsTrain, Xtrain, Ytrain = helperFunctions.read_corpus(offenseval_train, binary=False)

    Xtrain = helperFunctions.clean_samples(Xtrain)

    '''
    Preparing vectorizer and classifier
    '''
    # Vectorizing data / Extracting features
    # print('Preparing tools (vectorizer, classifier) ...')
    if params["tweetTokenization"]:
        count_word = transformers.CountVectorizer(
            ngram_range=(1, 2),
            stop_words=stop_words.get_stop_words('de'),
            tokenizer=TweetTokenizer().tokenize)
    else:
        count_word = transformers.CountVectorizer(
            ngram_range=(1, 2),
            stop_words=stop_words.get_stop_words('de'))

    count_char = transformers.CountVectorizer(analyzer='char', ngram_range=(3, 7))
    embedder = features.Embeddings(objects["embeddings"], pool='max')

    vectorizer = FeatureUnion([('word', count_word),
                               ('char', count_char),
                               ('word_embeds', embedder)])

    if len(featureList) > 0:
        vectorizer.transformer_list.append(
            ('lingFeats', objects["liguisticFeatureExtractor"]))
    if params["oneHotTermFreqFiltering"] or params["embeddingsTermFreqFiltering"]:
        vectorizer.transformer_list.append(
            ('freqFilter', objects["freqFilter"]))
    if params["charNgramFreqFiltering"]:
        objects["charFreqFilter"].oneHotEnabled = True
        objects["charFreqFilter"].embeddingsEnabled = False
        vectorizer.transformer_list.append(
            ('charfreqFilter', objects["charFreqFilter"]))
    if params["POStagCheck"]:
        vectorizer.transformer_list.append(
            ('posTagger', transformers.posTagExtractor(Xtrain, Ytrain)))

    # Set up SVM classifier with unbalanced class weights
    """
    if TASK == 'binary':
        # cl_weights_binary = None
        cl_weights_binary = {'OTHER': 1, 'OFFENSE': 10}
        clf = LinearSVC(class_weight=cl_weights_binary)
    else:
        # cl_weights_multi = None
        cl_weights_multi = {'OTHER': 0.5, 'ABUSE': 3, 'INSULT': 3, 'PROFANITY': 4}
        clf = LinearSVC(class_weight=cl_weights_multi)
    """
    clf = LinearSVC()

    # scaler = StandardScaler(with_mean=False)
    classifier = Pipeline([
        ('vectorize', vectorizer),
        # ('scale', scaler),
        ('classify', clf)
    ])

    '''
    Actual training and predicting:
    '''
    print('Fitting on training data...')
    classifier.fit(Xtrain, Ytrain)
    print("storing")
    dump(classifier, "./Models/RUG_Offense_concatModel.joblib")

    ## print("cross validating")
    ## predicting on set aside training data
    # print('Predicting on set aside data...')
    # Yguess = classifier.predict(XcustomTest)
    # result = cross_validate(classifier, Xtrain, Ytrain, cv=10)
    # print(result)

    # print('Predicting...')
    # Yguess = classifier.predict(Xtest)
    """
    '''
    Outputting in format required
    '''
    print('Outputting predictions...')
    outdir = '/Users/balinthompot/RUG/Honours/HateSpeech/offenseval-rug-master/Submission'
    fname = 'rug_fine_2.txt'
    with open(outdir + '/' + fname, 'w', encoding='utf-8') as fo:
        assert len(Yguess) == len(Xtest_raw), 'Unequal length between samples and predictions!'
        for idx in range(len(Yguess)):
            # print(Xtest_raw[idx] + '\t' + Yguess[idx] + '\t' + 'XXX', file=fo)  # binary task (coarse)
            print(Xtest_raw[idx] + '\t' + 'XXX' + '\t' + Yguess[idx], file=fo)  # multi task (fine)
    print('Done.')
    """
    return classifier
def __init__(self):
    self.tknzr = TweetTokenizer()
import json
import re

import numpy as np
import pyqtgraph as pg
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

name_cand1 = "Donald"
name_cand2 = "Hillary"

# Tags that correspond to each candidate
tags_cand1 = {"trumpforpresident": None, "lasttimetrumppaidtaxes": None,
              "trump4president": None, "trumptaxes": None, "trump": None,
              "donaldtrump": None, "makeamericagreatagain": None,
              "trumptales": None}
tags_cand2 = {"hillaryclinton": None, "sheswithus": None,
              "hillaryforpresident": None, "hillary4president": None,
              "imwithher": None, "hillary": None, "clinton": None}

tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)

# Adding stopwords to the pre-existing stopwords
stop = set(stopwords.words('english') +
           ["rt", ":", "!", "-", "…", "?", "url", ".", "’", "\n"])

tweetNum_cand1 = 0
tweetNum_cand2 = 0
ptweet_cand1 = []
ntweet_cand1 = []
ptweet_cand2 = []
ntweet_cand2 = []

# Load positive words
p = {}
with open("positive_words.txt") as f:
def tknzrTweet(txt):
    tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
    return tknzr.tokenize(txt)
def demo_tweets(trainer, n_instances=None, output=None):
    """
    Train and test a Naive Bayes classifier on 10000 tweets, tokenized using
    TweetTokenizer.

    Features are composed of:
        - 1000 most frequent unigrams
        - 100 top bigrams (using BigramAssocMeasures.pmi)

    :param trainer: `train` method of a classifier.
    :param n_instances: the number of total tweets that have to be used for
        training and testing. Tweets will be equally split between positive
        and negative.
    :param output: the output file where results have to be reported.
    """
    from nltk.tokenize import TweetTokenizer
    from nltk.sentiment import SentimentAnalyzer
    from nltk.corpus import twitter_samples, stopwords

    # Different customizations for the TweetTokenizer
    tokenizer = TweetTokenizer(preserve_case=False)
    # tokenizer = TweetTokenizer(preserve_case=True, strip_handles=True)
    # tokenizer = TweetTokenizer(reduce_len=True, strip_handles=True)

    if n_instances is not None:
        n_instances = int(n_instances / 2)

    fields = ["id", "text"]
    positive_json = twitter_samples.abspath("positive_tweets.json")
    positive_csv = "positive_tweets.csv"
    json2csv_preprocess(positive_json, positive_csv, fields, limit=n_instances)

    negative_json = twitter_samples.abspath("negative_tweets.json")
    negative_csv = "negative_tweets.csv"
    json2csv_preprocess(negative_json, negative_csv, fields, limit=n_instances)

    neg_docs = parse_tweets_set(negative_csv, label="neg", word_tokenizer=tokenizer)
    pos_docs = parse_tweets_set(positive_csv, label="pos", word_tokenizer=tokenizer)

    # We separately split subjective and objective instances to keep a balanced
    # uniform class distribution in both train and test sets.
    train_pos_docs, test_pos_docs = split_train_test(pos_docs)
    train_neg_docs, test_neg_docs = split_train_test(neg_docs)

    training_tweets = train_pos_docs + train_neg_docs
    testing_tweets = test_pos_docs + test_neg_docs

    sentim_analyzer = SentimentAnalyzer()

    # stopwords = stopwords.words('english')
    # all_words = [word for word in sentim_analyzer.all_words(training_tweets)
    #              if word.lower() not in stopwords]
    all_words = [word for word in sentim_analyzer.all_words(training_tweets)]

    # Add simple unigram word features
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words, top_n=1000)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

    # Add bigram collocation features
    bigram_collocs_feats = sentim_analyzer.bigram_collocation_feats(
        [tweet[0] for tweet in training_tweets], top_n=100, min_freq=12)
    sentim_analyzer.add_feat_extractor(extract_bigram_feats, bigrams=bigram_collocs_feats)

    training_set = sentim_analyzer.apply_features(training_tweets)
    test_set = sentim_analyzer.apply_features(testing_tweets)

    classifier = sentim_analyzer.train(trainer, training_set)
    # classifier = sentim_analyzer.train(trainer, training_set, max_iter=4)
    try:
        classifier.show_most_informative_features()
    except AttributeError:
        print(
            "Your classifier does not provide a show_most_informative_features() method."
        )
    results = sentim_analyzer.evaluate(test_set)

    if output:
        extr = [f.__name__ for f in sentim_analyzer.feat_extractors]
        output_markdown(
            output,
            Dataset="labeled_tweets",
            Classifier=type(classifier).__name__,
            Tokenizer=tokenizer.__class__.__name__,
            Feats=extr,
            Results=results,
            Instances=n_instances,
        )
# from os import path
# from PIL import Image
import sys   # used to handle emojis
import csv

import nltk
import pandas as pd
from nltk.tokenize import TweetTokenizer
from nltk import FreqDist
# import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Instantiate a TweetTokenizer object:
#   strip_handles=True  -> removes user mentions from the tweet
#   reduce_len=True     -> collapses repeated characters
#   preserve_case=False -> lowercases all text
tweet_tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True, preserve_case=False)

# Map emojis outside the Basic Multilingual Plane to the replacement character
non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)

# Target dataset
arquivo_csv = 'twitterData.csv'

# Load the dataset into a dataframe
dt_frame = pd.read_csv(arquivo_csv, encoding='UTF-8')

# Concatenate all tweets into a single text variable
text = ''
for val in dt_frame.tweet:
    text = text + val + ' '
import re
import string

import nltk
import pandas as pd
from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

nltk.download('stopwords')
nltk.download('wordnet')

# Global variables
lemmatizer = WordNetLemmatizer()
tknzr = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
# stemmer = PorterStemmer()
stemmer = SnowballStemmer("english")  # Snowball is faster, an improvement over the Porter stemmer
stop_words_en = set(stopwords.words('english'))
stop_words_es = set(stopwords.words('spanish'))
# excluded = set(string.punctuation.replace('#', '').replace('@', ''))


def readDataset(filename):
    df = pd.read_csv(filename)
    return df


def stopWordsTextEnglish(word_tokens):
    words_filtered_en = [w for w in word_tokens if w not in stop_words_en]
    return words_filtered_en
parser.add_argument("-year", type=int, default=2014, help="VQA dataset year (2014/2017)") args = parser.parse_args() print("Loading dataset...") trainset = VQADataset(args.data_dir, year=args.year, which_set="train") validset = VQADataset(args.data_dir, year=args.year, which_set="val") testdevset = VQADataset(args.data_dir, year=args.year, which_set="test-dev") testset = VQADataset(args.data_dir, year=args.year, which_set="test") tokenizer = TweetTokenizer(preserve_case=False) print("Loading glove...") with io.open(args.glove_in, 'r', encoding="utf-8") as f: vectors = {} for line in f: vals = line.rstrip().split(' ') vectors[vals[0]] = [float(x) for x in vals[1:]] print("Mapping glove...") glove_dict = {} not_in_dict = {} for _set in [trainset, validset, testdevset, testset]: for g in _set.games: words = tokenizer.tokenize(g.question) for w in words:
        self.load_state_dict(torch.load(
            path, map_location=lambda storage, loc: storage)[args.model_name])
        print('Model Loaded')

    def num_all_params(self):
        return sum([param.nelement() for param in self.parameters()])


df_noSPammer = pd.read_html('textMaliciousMark_Small_NotSpammer.html')
df_noSPammer = df_noSPammer[0][1:]
df_Spammer = pd.read_html('textMaliciousMark_Small_Spammer.html')
df_Spammer = df_Spammer[0][1:]
df = pd.concat([df_Spammer, df_noSPammer])
del df_noSPammer, df_Spammer

tknzr = TweetTokenizer()
ps = nltk.stem.PorterStemmer()


def preprocessingInputData(input):
    allText = [i for i in input]
    preprocessedText = [
        [ps.stem(word)
         for word in tknzr.tokenize(
             re.sub(r'\d+', '',
                    re.sub(r"http\S+|www.\S+",
                           lambda match: urlparse(match.group()).netloc or match.group().split("/")[0],
                           sentence)).lower())
         if word not in nltk.corpus.stopwords.words('english') and len(word) >= 3]
        for sentence in allText
    ]
    return preprocessedText


def mapFromWordToIdx(input):
    wholeText = []
    for s in input:
def clean_text(text, remove_punt_number_special_chars=False,
               remove_stopwords=False, apply_stemming=False):
    """Clean text

    Args:
        text: (str) Text
        remove_punt_number_special_chars: (bool) Remove punctuation, numbers and special characters
        remove_stopwords: (bool) Remove stopwords
        apply_stemming: (bool) Apply stemming to the words in the text
    """
    # Remove emojis
    text = re.sub(r":[a-zA-Z\-\_]*:", "", emoji.demojize(text))  # :hear-no-evil_monkey:
    text = re.sub(r":\w+:", "", emoji.demojize(text))
    text = re.sub(r":\w+\’\w+:", "", emoji.demojize(text))  # :woman's_boot:

    # Remove mentions, usernames (@USER)
    text = re.sub(r"\s*@USER\s*", '', text)

    # Remove URL
    text = re.sub(r"\s*URL\s*", '', text)

    # Unescape common HTML entities and replace "&" with "and"
    text = re.sub(r"&amp;", "and", text)
    text = re.sub(r"&lt;", "<", text)
    text = re.sub(r"&gt;", ">", text)
    text = re.sub(r"&", "and", text)

    # Replace contractions and slang
    text = re.sub(r"i'm", "I'm", text)
    text = contractions.fix(text, slang=True)

    # Lowercase
    text = text.lower()

    # Remove hashtags + words
    text = re.sub(r"#\s*\w+\s*", '', text)

    # Remove repeated whitespace
    text = re.sub(r"\s{2,}", " ", text)

    # Remove non-ASCII characters
    text = text.encode("ascii", errors="ignore").decode()

    # Remove punctuation, numbers and special characters (removes emoticons)
    if remove_punt_number_special_chars:
        text = re.sub('[^a-zA-Z]', ' ', text)

    # Tokenize text
    tt = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    text_tokens = tt.tokenize(text)

    # Remove stopwords
    if remove_stopwords:
        stopwords = set(STOPWORDS)
        text_tokens = [
            token for token in text_tokens if token not in stopwords
        ]

    # Stemming
    if apply_stemming:
        text_tokens = [stemmer.stem(token) for token in text_tokens]

    clean = " ".join(text_tokens)
    return clean
def tokenize(s):
    return TweetTokenizer(preserve_case=False).tokenize(s)
options_file = 'data/elmo_2x4096_512_2048cnn_2xhighway_options.json'
weight_file = 'data/elmo_2x4096_512_2048cnn_2xhighway_weights'
encoding_type_map = {'lstm': 'lstm', 'linear': 'linear'}
model_name = 'PathBasedGCN'

if evaluation_mode:
    logger = config_logger('evaluation/' + model_name)
else:
    logger = config_logger('PathBasedGCN')

model_path = os.getcwd() + '/models/' + model_name + '/'
if not os.path.exists(model_path):
    os.makedirs(model_path)

tokenize = TweetTokenizer().tokenize

logger.info('Hop number is %s', hops)
logger.info('Learning rate is %s', learning_rate)
logger.info('Training epoch is %s', epochs)
logger.info('Batch size is %s', batch_size)
logger.info('Dropout rate is %f', dropout)
logger.info('Encoding size for nodes and query feature is %s', encoding_size)

query_feature_type = encoding_type_map['lstm']
logger.info('Encoding type for query feature is %s', query_feature_type)

dynamic_change_learning_rate = True
logger.info('Is learning rate changing along with epoch count: %s',
            dynamic_change_learning_rate)

# tf.reset_default_graph()
tf.reset_default_graph()
def tokenize(data):
    toked = []
    tknzr = TweetTokenizer()
    for d in data:
        toked.append(tknzr.tokenize(d))
    return toked
# nltk.download('stopwords')
# nltk.download('wordnet')
import re

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

vectorizer = CountVectorizer(min_df=0, stop_words=None, strip_accents='ascii')
wordnet_lemmatizer = WordNetLemmatizer()       # lemmatizer object
stop_words = set(stopwords.words('english'))   # English stop words
porter_stemmer = PorterStemmer()               # stemming object

df = pd.read_csv('Womens Clothing E-Commerce Reviews.csv')
corpus, real, rating, product = [], [], [], []
tk = TweetTokenizer()                          # tokenizing object
dictionary = {}

# Build the corpus: tokenize the reviews and remove symbols
for i in df.index:
    if df['Department Name'][i] == 'Jackets' and str(df['Recommended IND'][i]) == '1':
        review = str(df['Review Text'][i])
        real.append(review)
        rating.append(str(df['Rating'][i]))
        product.append(str(df['Clothing ID'][i]))
        review = re.sub("[^A-Za-z']+", ' ', review)   # remove all symbols
        review = ' '.join(tk.tokenize(review))        # tokenize
        corpus.append(review)

print('The corpus for Jackets consists of ' + str(len(corpus)) + ' reviews.')