def reasoning(dList):
    reasonList = []
    tokenizer = TweetTokenizer()
    for tweet in dList:
        print(tweet)
        # tokenize
        words = tokenizer.tokenize(tweet)
        # get POS tags
        pos_tokens = pos_tag(words)
        # get named entities
        tree = ne_chunk(pos_tokens, binary=False)
        # find relations
        pairs = relextract.tree2semi_rel(tree)
        # get interesting named entities
        reason = []
        for s, tree in pairs:
            reasonStr = "%s" % tree
            reasonStr = reasonStr.split(" ")
            label = reasonStr[0].replace("(", "").strip()
            content = ""
            for wordTag in reasonStr[1:]:
                sp = wordTag.split("/")
                word = sp[0].replace("(", "")
                print(word)
                content += word + " "
            # reason: [(label, content)]
            reason.append({"label": label, "content": content})
        # reasonList: [reason]
        if len(reason) > 0:
            reasonList.append({"reason": reason})
    print(str(len(reasonList)) + "/" + str(len(dList)))
    return reasonList
def nltk_tokenize(text):
    tknzr = TweetTokenizer()
    tokens = tknzr.tokenize(text)
    return tokens
def _tag_text(self, tweet_text):
    tokenizer = TweetTokenizer()
    tokens = tokenizer.tokenize(tweet_text)
    tagged = nltk.pos_tag(tokens)
    entities = nltk.chunk.ne_chunk(tagged)
    neList = traverse(entities)
    return neList
def process_tweets(file_name):
    '''
    Person Responsible: Devin Munger

    file_name: filename of tweets as returned from API based on query
    Extract text from file; return dataframe with tweet text, id
    '''
    ## Create empty dataframe
    tweet_df = pd.DataFrame(columns=["text", "id"])
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True)
    ## Read each JSON from file
    with open(file_name) as data_file:
        for entry in data_file.readlines():
            tweet = json.loads(entry)
            tweet_id = str(tweet.get("id", ""))
            text = tweet.get("text", "")
            ## Remove links from text
            text = re.sub(r"http\S+", "", text)
            ## Remove twitter keywords (str.replace returns a new string)
            text = text.replace("RT ", "")
            ## Remove handle, punctuation from tweet text
            text_words = filter(lambda x: x not in string.punctuation,
                                tokenizer.tokenize(text))
            ## Add tweet to dataframe
            tweet_df.loc[len(tweet_df)] = [" ".join(text_words), tweet_id]
    return tweet_df
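# Usage sketch (not part of the original source): process_tweets above assumes
# `pd`, `json`, `re`, `string`, and TweetTokenizer are already imported.
# "sample_tweets.json" is a hypothetical file containing one tweet JSON object per line.
if __name__ == "__main__":
    df = process_tweets("sample_tweets.json")
    print(df.head())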
def load_csv():
    with open('Tweets.csv') as csvfile:
        reader = csv.DictReader(csvfile)
        count = 1
        reviews = []
        stars = []
        tknzr = TweetTokenizer()
        for row in reader:
            try:
                words = tknzr.tokenize(row['text'])
                label = 'SENT_%s' % count
                reviews.append(TaggedDocument(words, [label]))
                stars.append(row['airline_sentiment'])
                count += 1
            except Exception:
                # skip malformed rows
                continue
    print("final count:", count)
    return reviews, stars
def load_data_and_labels_gameforum():
    # load
    with open("./input/gameforum-1000.csv", 'rU') as f:
        rdr = csv.reader(f)
        dataset = list(rdr)[1:]  # remove header
    dataset = [entry for entry in dataset
               if entry[1] == '1' or entry[1] == '2' or entry[1] == '3']
    # generate x
    tk = TweetTokenizer(reduce_len=True)
    x_text = [entry[0] for entry in dataset]
    x_text = [clean_str(post) for post in x_text]
    x_text = [tk.tokenize(post) for post in x_text]
    # generate y
    y = [entry[1] for entry in dataset]
    for idx, label in enumerate(y):
        if label == '1':    # positive
            y[idx] = [1, 0, 0]
        elif label == '2':  # neutral
            y[idx] = [0, 1, 0]
        elif label == '3':  # negative
            y[idx] = [0, 0, 1]
        else:
            print('wrong label in gameforum: ' + label)
    return [x_text, y]
def load_tweetkeywords():
    """
    Check and see which keywords are used in each tweet, and load the
    association table linking tweets and keywords
    """
    # TweetKeyword.query.delete()
    tweets = Tweet.query.all()
    keyword_query = Keyword.query.all()
    keywords = [word.keyword for word in keyword_query]

    tknzr = TweetTokenizer()
    for tweet in tweets:
        tokenized_tweets = tknzr.tokenize(tweet.text)
        for token in tokenized_tweets:
            if token in keywords:
                tweet_id = Tweet.query.filter(Tweet.tweet_id == tweet.tweet_id).one()
                keyword_id = Keyword.query.filter(Keyword.keyword == token).one()
                tweet_keyword = TweetKeyword(keyword_id=keyword_id.keyword_id,
                                             tweet_id=tweet_id.tweet_id)
                print("Added to TweetKeyword table: {}".format(tweet_keyword.keyword_id))
                db.session.add(tweet_keyword)
    db.session.commit()
def parse(self, text):
    # Tokenize message
    tokenizer = TweetTokenizer()
    words = tokenizer.tokenize(text)
    retweet_term = 'RT'
    urls = []
    users = []
    hash_tags = []
    for word in words:
        if word[0] == '@':      # user in Twitter
            users.append(word)
        elif word[0] == '#':    # hash tag
            hash_tags.append(word)
        elif word.find('http:') == 0 or word.find('https:') == 0:  # url
            urls.append(word)
    for f in urls + users + hash_tags + [retweet_term]:
        if f in words:
            words.remove(f)
    self.words = words
    self.urls = urls
    self.users = users
    self.hash_tags = hash_tags
def format_text(entries, LSTM_shape=True): THIS_FOLDER = str(os.path.dirname(os.path.abspath(__file__))) sentences = [] tokenizer = nltk.data.load('tokenizers/punkt/english.pickle') decoded = base64.b64decode(entries) decoded = str(decoded) decoded = decoded[2:] decoded = decoded[:-1] decoded = decoded.split(".") #print(decoded, "is decoded") for entry in decoded: token_sentences = tokenizer.tokenize(entry) for sentence in token_sentences: sentences.append(sentence) tokenized_sentences = [] #remove_tokens = ['%', ']', '[', '.', ',', '?', '!', '\''] #remove_tokens = string.punctuation remove_tokens = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~' stop_words = set(stopwords.words('english')) tweet_tknzr = TweetTokenizer() for sentence in sentences: tokens = tweet_tknzr.tokenize(sentence) tokens = list(filter(lambda a: a not in remove_tokens and a not in stop_words, tokens)) tokenized_sentences.append(tokens) all_ngrams1 = np.load(THIS_FOLDER+'/ngrams1.npy').item() all_ngrams2 = np.load(THIS_FOLDER+'/ngrams2.npy').item() all_ngrams3 = np.load(THIS_FOLDER+'/ngrams3.npy').item() #once the model gets updated with good data, ngrams.py needs to get changed/updated too! X = np.zeros((len(sentences), len(all_ngrams1)+len(all_ngrams2)+len(all_ngrams3))) for i in range(len(tokenized_sentences)): sentence = tokenized_sentences[i] my_ngrams = ngrams(sentence, 1) for gram in my_ngrams: if gram in all_ngrams1: index = all_ngrams1[gram] X[i][index] = 1 for i in range(len(tokenized_sentences)): sentence = tokenized_sentences[i] my_ngrams = ngrams(sentence, 2) for gram in my_ngrams: if gram in all_ngrams2: index = len(all_ngrams1) + all_ngrams2[gram] X[i][index] = 1 for i in range(len(tokenized_sentences)): sentence = tokenized_sentences[i] my_ngrams = ngrams(sentence, 3) for gram in my_ngrams: if gram in all_ngrams3: index = len(all_ngrams1) + len(all_ngrams2) + all_ngrams3[gram] X[i][index] = 1 if LSTM_shape: X = np.reshape(X, (X.shape[0], 1, X.shape[1])) else: X = np.reshape(X, (X.shape[0], X.shape[1])) return X
def preprocess_db(): tkn = TweetTokenizer() photos = pd.read_pickle(r'./data/restaurant_photos_with_labels.pkl') img_path = r'./data/restaurant_photos/' sentid = 1 img_list = [] # Split data in such a way that labels are evenly distributed between 6 folds skf = StratifiedKFold(photos['label'], n_folds=6) folds = [] # Initialize all images to train dataset initially photos['split'] = ['train' for i in range(len(photos))] # Obtain the indices for the test and validation splits and change value appropriately for _, test_ix in skf: folds.append(test_ix) photos.split[folds[0]] = 'test' photos.split[folds[1]] = 'val' # Obtain the information from each picture and move the pictures to the appropriate dir. The images are renamed. for i, photo_id in enumerate(photos.photo_id): img_dict = dict() img_dict['sentids'] = [sentid] img_dict['business_id'] = photo_id.business_id[i] if photos.split[i] in ['train']: img_dict['filepath'] = u'train' img_dict['imgid'] = 0 img_dict['split'] = u'train' shutil.copy(img_path + photo_id + '.jpg', './data/restaurant_photos_split/train/' + str(sentid).zfill(6) + '.jpg') elif photos.split[i] in ['test']: img_dict['filepath'] = u'test' img_dict['imgid'] = 0 img_dict['split'] = u'test' shutil.copy(img_path + photo_id + '.jpg', './data/restaurant_photos_split/test/' + str(sentid).zfill(6) + '.jpg') else: img_dict['filepath'] = u'val' img_dict['imgid'] = 0 img_dict['split'] = u'val' shutil.copy(img_path + photo_id + '.jpg', './data/restaurant_photos_split/val/' + str(sentid).zfill(6) + '.jpg') img_dict['label'] = photos.label[i] caption_dict = dict() if photos.caption[i]: # Tokenize the captions caption_dict['tokens'] = tkn.tokenize(photos.caption[i]) caption_dict['raw'] = photos.caption[i] else: caption_dict['tokens'] = 'None' caption_dict['raw'] = 'None' caption_dict['imgid'] = 0 caption_dict['sentid'] = sentid img_dict['sentences'] = [caption_dict] img_dict['photoid'] = sentid img_dict['yelpid'] = photo_id img_list.append(img_dict) sentid += 1 # Store the new dataset as a JSON file with open("./data/image_caption_dataset.json", "w") as outfile: json.dump(img_list, outfile)
def check():
    check_id = request.args.get("id")
    if check_id is not None:
        check_sentence = Sentence.query.get(check_id)
        if check_sentence is not None:
            Word.query.filter_by(sentence_id=check_id).delete()
            tweet_tokenizer = TweetTokenizer()
            tokens = tweet_tokenizer.tokenize(check_sentence.text)
            for token in tokens:
                url = "http://kateglo.com/api.php?format=json&phrase=" + token
                resp = requests.get(url)
                exist = False
                if resp.ok:
                    try:
                        resp_json = json.loads(resp.content)
                        exist = True
                    except ValueError:
                        exist = False
                word = Word(check_sentence.id, token, exist)
                db.session.add(word)
            db.session.commit()
    sentences = Sentence.query.all()
    c = ((sentence.id,
          sentence.source,
          sentence.text,
          ((w.word, w.exist,) for w in sentence.words.all()),
          ) for sentence in sentences)
    return render_template('check.html', rows=c)
def get_tweet_tags(tweet):
    """ Break up a tweet into individual word parts """
    tknzr = TweetTokenizer()
    tokens = tknzr.tokenize(tweet)
    # replace handles with real names
    for n, tok in enumerate(tokens):
        if tok.startswith('@'):
            handle = tok.strip("@")
            if handle in user.students:
                # If we have a database entry for the mentioned user, we can
                # easily substitute a full name.
                usr = user.NPUser(handle)
                tokens[n] = usr.fullname
            else:
                # If there is no database entry, we use the user's alias. While
                # this is the full name in many cases, it is often not reliable
                usr = api.get_user(handle)
                tokens[n] = usr.name
    tagged = nltk.pos_tag(tokens)
    # In nltk, if a teacher's name is written with a period after an
    # abbreviated prefix, it is awkwardly broken up into 3 tags
    for n, tag in enumerate(tagged):
        # If there is the weird period after the prefix,
        if tag[1] == '.':
            # and it is in fact splitting up a person's name,
            if tagged[n - 1][1] == 'NNP' and tagged[n + 1][1] == 'NNP':
                if tagged[n - 1][0] in ['Mr', 'Ms', 'Mrs', 'Mx']:
                    # combine it into the actual name,
                    tagged[n - 1] = ('{}. {}'.format(tagged[n - 1][0], tagged[n + 1][0]), 'NNP')
                    # and then remove the extra tags.
                    del tagged[n + 1]
                    del tagged[n]
    return tagged
def load_data_and_labels_sam():
    # load
    with open("./input/2780_freshmen_tweets.csv", 'rU') as f:
        rdr = csv.reader(f)
        dataset = list(rdr)[1:]  # remove header
    # filter out tweets with unknown sentiment
    dataset = [entry for entry in dataset if entry[4] != '0']
    # generate x
    tk = TweetTokenizer(reduce_len=True)
    x_text = [entry[3] for entry in dataset]
    x_text = [clean_str(tweet) for tweet in x_text]
    x_text = [tk.tokenize(tweet) for tweet in x_text]
    # generate y
    y = [entry[4] for entry in dataset]
    for idx, label in enumerate(y):
        if label == '1':    # positive
            y[idx] = [1, 0, 0]
        elif label == '2':  # neutral
            y[idx] = [0, 1, 0]
        elif label == '3':  # negative
            y[idx] = [0, 0, 1]
        else:
            print('wrong label in sam: ' + label)
    return [x_text, y]
def preprocess_tweets(event_date, dt=datetime.timedelta(seconds=30), match=None,
                      tweet_processor=None, match_type='home'):
    import collections
    tknzr = TweetTokenizer()
    dbname = match['dbname']
    collname_home = match['collname_home']
    collname_away = match['collname_away']
    home_team = match['home_team']
    away_team = match['away_team']

    if match_type == 'home':
        coll = client[dbname][collname_home]
    else:
        coll = client[dbname][collname_away]

    # add some padding to the start and end times
    date_start = event_date - dt
    date_end = event_date + dt
    query = {"created_at": {"$gt": date_start, "$lt": date_end}}
    results = coll.find(query)

    clean_tweets = []
    for result in results:
        tweet_id = result['id_str']
        tweet_split = tweet_processor.preprocess(result['text'].encode('ascii', 'ignore'))
        parts = tknzr.tokenize(tweet_split)
        clean = [i for i in parts if i not in stop]
        clean_text = " ".join(clean)
        clean_tweets.append((clean_text, tweet_id))
    return clean_tweets
def createDataset(filename, MAX_VOCAB_SIZE):
    yaks = []
    tokenizer = TweetTokenizer()
    ids = set()
    numyaks = 0
    for line in open(filename).readlines():
        stuff = line.split(":::")
        id = stuff[0]
        if len(stuff) > 3 and id not in ids:
            numyaks += 1
            sentence = stuff[3]
            ids.add(id)
            tokens = [START_TOKEN]
            tokens.extend(tokenizer.tokenize(sentence.lower()))
            tokens.append(END_TOKEN)
            yaks.append(tokens)
    token_frequency = nltk.FreqDist(itertools.chain(*yaks))
    vocab = token_frequency.most_common(MAX_VOCAB_SIZE - 1)
    i2t = [token[0] for token in vocab]
    i2t.append(UNKNOWN_TOKEN)
    t2i = dict()
    for i, t in enumerate(i2t):
        t2i[t] = i
    yaks = [[t if t in t2i else UNKNOWN_TOKEN for t in yak] for yak in yaks]
    Xtrain = np.asarray([[t2i[token] for token in yak[:-1]] for yak in yaks])
    Ytrain = np.asarray([[t2i[token] for token in yak[1:]] for yak in yaks])
    print("Num unique Yaks: " + str(numyaks))
    return (Xtrain, Ytrain, i2t, t2i)
def getTweetTokens(classification, toRead, info, tags):
    i = 0
    tknzr = TweetTokenizer()
    with open(toRead) as f:
        content = f.readlines()
    c = 0
    for item in content:
        # adapt the list into python dictionary format
        content[c] = item.replace("null", "None")
        content[c] = content[c].replace("false", "False")
        content[c] = content[c].replace("true", "True")
        c += 1
    for i in range(len(content)):
        tweet = eval(content[i])["text"]
        tokenTweet = tknzr.tokenize(tweet)
        j = 0
        k = 0
        while j < (len(tokenTweet) - k):
            if tokenTweet[j][0] == "#":
                tokenTweet[j] = tokenTweet[j][1:]
            elif tokenTweet[j][0] == "@":
                del tokenTweet[j]
                j -= 1
                k += 1
            j += 1
        info.append((word_feats(tokenTweet), classification))
def main():
    # sys.stdin already yields text in Python 3, so no explicit decode is needed
    text = sys.stdin.read()
    tknzr = TweetTokenizer()
    tok = tknzr.tokenize(text)
    saved_object = construct_dict(tok)
    print(json.dumps(saved_object))
def keywords_search(reviews): key_map = {} # for k in open(os.getcwd() + "/KeyWord/keyword_map_general.txt", 'r'): for k in open(keyword_general_path, 'r'): a = k.strip().split(", ") key_map[a[0]] = a[1] special_map = {} # for k in open(os.getcwd() + "/KeyWord/keyword_map_special.txt", 'r'): for k in open(keyword_special_path, 'r'): a = k.strip().split(", ") special_map[a[0]] = a[1] raw = reviews.lower() tokenizer = TweetTokenizer() tokens = tokenizer.tokenize(raw) # remove punctuations no_punc_tokens = [i for i in tokens if (not i in string.punctuation+string.digits) and (not "." in i)] # remove stop words from tokens en_stop = get_stop_words('en') stopped_tokens = [i for i in no_punc_tokens if not i in en_stop] # stem tokens # wordnet_lemmatizer = WordNetLemmatizer() # stemmed_tokens = [wordnet_lemmatizer.lemmatize(i) for i in stopped_tokens ] chosen_key_words = [] # Search in general key word key_words_dict = dict.fromkeys(key_map.values(), 0) # Select keyword use only key word to select # s = set(stemmed_tokens) s = set(stopped_tokens) for t in key_map.keys(): if t in s: key_words_dict[key_map[t]] += 1 for d in sorted(zip(key_words_dict.values(), key_words_dict.keys()))[:-4:-1]: if d[0] > 0: chosen_key_words.append(d[1]) # Search in special keyword special_words_dict = dict.fromkeys(special_map.values(), 0) # Select keyword using wordnet # Select keyword use only key word to select # s = set(stemmed_tokens) s = set(stopped_tokens) for t in special_map.keys(): if t in s: special_words_dict[special_map[t]] += 1 for d in sorted(zip(special_words_dict.values(), special_words_dict.keys()))[:-3:-1]: if d[0] > 0: chosen_key_words.append(d[1]) return ' '.join(chosen_key_words)
def get_utterances(utterances, line, category, wgram, cgram):
    tknzr = TweetTokenizer()
    gram_list = []

    # WORD GRAMS
    if wgram == 1:
        # unigram
        wgram_list = tknzr.tokenize(line)
    elif wgram == 2:
        # uni + bigram
        # unigram list
        tokens = nltk.wordpunct_tokenize(line)
        # bigram list
        finder = BigramCollocationFinder.from_words(tokens)
        scored = finder.score_ngrams(bigram_measures.raw_freq)
        bigram_list = sorted(bigram for bigram, score in scored)
        # res
        wgram_list = tknzr.tokenize(line) + bigram_list
    elif wgram == 3:
        # uni + bi + trigram
        # unigram list
        tokens = nltk.wordpunct_tokenize(line)
        # bigram list
        bi_finder = BigramCollocationFinder.from_words(tokens)
        bi_scored = bi_finder.score_ngrams(bigram_measures.raw_freq)
        bigram_list = sorted(bigram for bigram, biscore in bi_scored)
        # trigram list
        tri_finder = TrigramCollocationFinder.from_words(tokens)
        tri_scored = tri_finder.score_ngrams(trigram_measures.raw_freq)
        trigram_list = sorted(trigram for trigram, triscore in tri_scored)
        # res
        wgram_list = tknzr.tokenize(line) + bigram_list + trigram_list

    # CHAR GRAMS
    cgram_list = []
    if cgram == 1:
        # uni-chargram
        cgram_list = [line[i:i + 1] for i in range(len(line) - 1)]
    elif cgram == 2:
        # bi-chargram
        cgram_list = [line[i:i + 2] for i in range(len(line) - 1)]
    elif cgram == 3:
        # tri-chargram
        cgram_list = [line[i:i + 3] for i in range(len(line) - 1)]

    # RESULT
    if category == 'QA':            # non-task
        utterances.append((wgram_list + cgram_list, 0))
    elif category == 'Shopping':    # task
        utterances.append((wgram_list + cgram_list, 1))
    elif category == 'Travel':      # task
        utterances.append((wgram_list + cgram_list, 2))
    elif category == 'Hotel':       # task
        utterances.append((wgram_list + cgram_list, 3))
    elif category == 'Food':        # task
        utterances.append((wgram_list + cgram_list, 4))
    elif category == 'Art':         # task
        utterances.append((wgram_list + cgram_list, 5))
    elif category == 'Weather':     # task
        utterances.append((wgram_list + cgram_list, 6))
    elif category == 'Friends':     # task
        utterances.append((wgram_list + cgram_list, 7))
    elif category == 'Chat':        # chat
        utterances.append((wgram_list + cgram_list, 8))
    else:
        print(category, "ERROR")
def classify(classifier, featx, strings):
    print("Classify request")
    tokenizer = TweetTokenizer()
    mood = []
    for string in strings:
        string = Twitter.process_tweet(string)
        tokenized_text = [word.lower() for word in tokenizer.tokenize(string)]
        mood.append(classifier.classify(featx(tokenized_text)))
    return mood
def get_lyrics(self):
    time.sleep(10)
    soup = BeautifulSoup(self.get_song_page(), 'lxml')
    page_lyric = soup.find_all("div", limit=22)[-1]  # lyrics start on 22nd div
    lyrics = ''.join(page_lyric.find_all(text=True))
    tknzr = TweetTokenizer()
    lyrics = tknzr.tokenize(lyrics)
    lyrics = [word for word in lyrics if word not in self.HTML_TAGS]
    return " ".join(lyrics[20:])
def clean_tweet(tweet):
    tknzr = TweetTokenizer()
    tweet = re.sub(r"(?:\@|https?\://)\S+", "", tweet.lower())
    tweet = ' '.join(tweet.split())
    words = tknzr.tokenize(tweet)
    words = [''.join(c for c in s if c not in punctuation) for s in words]
    words = [s for s in words if s]
    sent = " ".join(words)
    return sent
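# Usage sketch (not in the original): clean_tweet above assumes `re`,
# TweetTokenizer, and `punctuation` (from the string module) are in scope;
# the sample tweet is illustrative only.
if __name__ == "__main__":
    sample = "RT @user: Loving #NLTK!!! https://t.co/abc123 :)"
    print(clean_tweet(sample))  # handles, URLs, and punctuation are stripped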
def load_data_and_labels_semeval():
    # load the entire semeval dataset
    old_dataset = list(open("./input/2013-dev"))
    old_dataset.extend(list(open("./input/2013-devtest")))
    old_dataset.extend(list(open("./input/2013-train")))
    old_dataset.extend(list(open("./input/2014-devtest")))

    new_dataset = list(open("./input/2016-train"))
    new_dataset.extend(list(open("./input/2016-dev")))
    new_dataset.extend(list(open("./input/2016-devtest")))

    # filter out invalid tweets from new dataset
    new_dataset = [entry for entry in new_dataset if entry.split('\t')[2] != 'Not Available\n']

    # generate x from old
    tk = TweetTokenizer(reduce_len=True)  # handles punctuations
    x_text = [entry.split('\t')[3] for entry in old_dataset]
    x_text = [clean_str(tweet) for tweet in x_text]
    x_text = [tk.tokenize(tweet) for tweet in x_text]

    # generate x from new
    x_text_new = [entry.split('\t')[2] for entry in new_dataset]
    x_text_new = [clean_str(tweet) for tweet in x_text_new]
    x_text_new = [tk.tokenize(tweet) for tweet in x_text_new]

    # concat x and x_new
    x_text.extend(x_text_new)

    # generate y from old
    y = [entry.split('\t')[2] for entry in old_dataset]
    for idx, label in enumerate(y):
        if label == 'positive':
            y[idx] = [1, 0, 0]
        elif label == 'neutral':
            y[idx] = [0, 1, 0]
        elif label == 'negative':
            y[idx] = [0, 0, 1]
        else:
            print('wrong label in semeval: ' + label)

    # generate y from new
    y_new = [entry.split('\t')[1] for entry in new_dataset]
    for idx, label in enumerate(y_new):
        if label == 'positive':
            y_new[idx] = [1, 0, 0]
        elif label == 'neutral':
            y_new[idx] = [0, 1, 0]
        elif label == 'negative':
            y_new[idx] = [0, 0, 1]
        else:
            print('wrong label in semeval: ' + label)

    # concat y and y_new
    y.extend(y_new)
    return [x_text, y]
def loaddata(inputfile):
    file = open(inputfile)
    tknzr = TweetTokenizer()
    sentences = []
    while 1:
        line = file.readline().strip()
        if not line:
            break
        sentences.append(tknzr.tokenize(line))
    return sentences
def count_tweets_keywords(tweets):
    tknzr = TweetTokenizer()
    wordcounts = defaultdict(int)
    for tweet in tweets:
        if "text" in tweet:
            word_list = tknzr.tokenize(tweet["text"])
            filtered_words = [word for word in word_list
                              if word not in stopwords.words("english")]
            for word in filtered_words:
                wordcounts[word] += 1
    return wordcounts
def tokenize(file_name):
    """ Takes as input a file name. Tokenize the tweets, separating them
    using the nltk function. Return a list of tokens. """
    tokenizer = TweetTokenizer(strip_handles=True)
    tokens = []
    file = open(file_name, 'r')
    for line in file:
        tokens.append(tokenizer.tokenize(line))
    file.close()
    return tokens
def get_sentence_from_training_doc_regexp(filename):
    document = get_text_from_training_doc_regexp(filename)
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    sentences = sent_detector.tokenize(document)
    tknzr = TweetTokenizer()
    tokenized_sentences = []
    for i in range(len(sentences)):
        tokens = tknzr.tokenize(sentences[i].strip())
        tokenized_sentences.append(' '.join(tokens))
    return tokenized_sentences
def keywords_search(reviews): key_map = {} for k in open(keyword_general_path, 'r'): a = k.strip().split(", ") key_map[a[0]] = a[1] special_map = {} for k in open(keyword_special_path, 'r'): a = k.strip().split(", ") special_map[a[0]] = a[1] # get the tokens from the review raw = reviews.lower() tokenizer = TweetTokenizer() tokens = tokenizer.tokenize(raw) # remove punctuations no_punc_tokens = [i for i in tokens if (not i in string.punctuation + string.digits) and (not "." in i)] # remove stop words from tokens en_stop = get_stop_words('en') stopped_tokens = [i for i in no_punc_tokens if not i in en_stop] chosen_key_words = ['chinese'] # Search in general key word key_words_dict = dict.fromkeys(key_map.values(), 0) # Select keyword use only key word to select s = set(stopped_tokens) for t in key_map.keys(): if t in s: key_words_dict[key_map[t]] += 1 for d in sorted(zip(key_words_dict.values(), key_words_dict.keys()))[:-4:-1]: if d[0] > 0: chosen_key_words.append(d[1]) # Search in special keyword special_words_dict = dict.fromkeys(special_map.values(), 0) # Select keyword using wordnet # Select keyword use only key word to select s = set(stopped_tokens) for t in special_map.keys(): if t in s: special_words_dict[special_map[t]] += 1 for d in sorted(zip(special_words_dict.values(), special_words_dict.keys()))[:-3:-1]: if d[0] > 0: chosen_key_words.append(d[1]) return ', '.join(chosen_key_words)
def preprocess_docs(documents):
    tokenizer = TweetTokenizer()
    english_stemmer = nltk.stem.SnowballStemmer('english')
    texts = [tokenizer.tokenize(d) for d in documents]
    stemmed_texts = []
    for text in texts:
        stemmed_text = [english_stemmer.stem(t) for t in text]
        stemmed_texts.append(stemmed_text)
    return stemmed_texts
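# Usage sketch (not in the original): preprocess_docs above assumes `nltk` and
# TweetTokenizer are imported; it returns one list of stemmed tokens per document.
if __name__ == "__main__":
    docs = ["Running faster than ever!", "Cats running everywhere"]
    print(preprocess_docs(docs))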
def test_tweet_tokenizer(self):
    """
    Test TweetTokenizer using words with special and accented characters.
    """
    tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
    s9 = "@myke: Let's test these words: resumé España München français"
    tokens = tokenizer.tokenize(s9)
    expected = [':', "Let's", 'test', 'these', 'words', ':', 'resumé',
                'España', 'München', 'français']
    self.assertEqual(tokens, expected)
from nltk.tokenize import LineTokenizer, SpaceTokenizer, TweetTokenizer
from nltk import word_tokenize

line = "My name is Venkatram Veerareddy, technical architect.\n I am having 20 years of experience in "\
       " Software industry working \nfrom applications to products by using \n" \
       " C, C++, Java, Javascript and databases "\
       " like Oracle, MS SQL Server, Postgres, MySQL and OrientDB."

lTokenizer = LineTokenizer()
print("Line tokenizer output: ", lTokenizer.tokenize(line))

sTokenizer = SpaceTokenizer()
print("Space Tokenizer output: ", sTokenizer.tokenize(line))

print("Word Tokenizer output: ", word_tokenize(line))

tTokenizer = TweetTokenizer()
print("Tweet Tokenizer output: ",
      tTokenizer.tokenize("This is a coooool #dummysmiley: :-) :-P <3"))
class Processing: def __init__(self, read_n_write): self.happy_emoticons = read_n_write.read_any_list( './resources/happy_emoticons.txt') self.sad_emoticons = read_n_write.read_any_list( './resources/sad_emoticons.txt') self.slang = read_n_write.read_any_list('./resources/slang.txt') self.wnl = nltk.WordNetLemmatizer() self.tokenizer = TweetTokenizer() self.stop_words = set(stopwords.words('english')) self.stop_words.update([ 'url', "i'm", '@name', "@name's", "that's", "doesn't", 'u', 'would', 'else', 'anyone', "can't", "what's", "i've", 'could', "they're" ]) self.happy_emoticons_count = 0 self.sad_emoticons_count = 0 self.emoji_count = 0 self.slang_count = 0 self.stopwords_count = 0 self.emoji_list = [] self.emoji_list.extend(self.happy_emoticons) self.emoji_list.extend(self.sad_emoticons) self.punc_list = set(string.punctuation) self.ngrams = {} self.ngrams_pos = {} def write_unk_emoji(self): print('Printing not emotion annotated emoticon list') na_emoticon = list( set(self.emoji_list) - set(self.happy_emoticons).union(set(self.sad_emoticons))) print(na_emoticon) for emoticon in na_emoticon: print(emoticon + '\t' + self.emoji_dict[emoticon]) def process(self, tweets): self.happy_emoticons_count = 0 self.sad_emoticons_count = 0 self.emoji_count = 0 self.slang_count = 0 self.stopwords_count = 0 list_tweets, list_tweet_lemmas = self.processing_pos(tweets) final_tweet_lemmas = [] final_tweets = [] for list_token, list_lemmas in zip(list_tweets, list_tweet_lemmas): single_tweet_lemma = [] single_tweet = [] for word, lemma in zip(list_token, list_lemmas): if self.all_count(word, lemma): word = word.lower() single_tweet.append(word) single_tweet_lemma.append(lemma) final_tweet_lemmas.append(single_tweet_lemma) final_tweets.append(single_tweet) final_features = [ self.happy_emoticons_count, self.sad_emoticons_count, self.emoji_count, self.slang_count, self.stopwords_count ] return final_features, final_tweets, final_tweet_lemmas def all_count(self, word, lemma): if word.strip() == '': return False emoji_flag = False try: if word in emoji.UNICODE_EMOJI or word in self.emoji_list: emoji_flag = True new_emoji = emoji.demojize(word) self.emoji_count += 1 if not new_emoji in self.emoji_list: self.emoji_list.append(new_emoji) if new_emoji in self.happy_emoticons: self.happy_emoticons_count += 1 elif new_emoji in self.sad_emoticons: self.sad_emoticons_count += 1 except: pass if word in self.slang or lemma in self.slang: self.slang_count += 1 if word in self.stop_words or lemma in self.stop_words: self.stopwords_count += 1 return False if word in self.punc_list: return False if emoji_flag: return False return True def processing_pos(self, tweets): list_lemmas = [] list_tokens = [] for tweet in tweets: words = [] lemmas = [] for word in self.tokenizer.tokenize(tweet): lemma = self.wnl.lemmatize(word.lower()) if word in self.stop_words or word in self.punc_list or lemma in self.stop_words: continue else: lemmas.append(lemma) words.append(word) list_lemmas.append(lemmas) list_tokens.append(words) return list_tokens, list_lemmas def processing_lemma(self, list_of_sent): list_output = [] for sent in list_of_sent: words = [] for word in self.tokenizer.tokenize(sent): word = self.wnl.lemmatize(word.lower()) if word in self.stop_words or word in self.punc_list: continue else: words.append(word) list_output.append(words) return list_output
"""
Django settings for LCBweb project.

Generated by 'django-admin startproject' using Django 2.1.

For more information on this file, see
https://docs.djangoproject.com/en/2.1/topics/settings/

For the full list of settings and their values, see
https://docs.djangoproject.com/en/2.1/ref/settings/
"""
from nltk.tokenize import TweetTokenizer
import os
from home.beliefEng.Belief_tagger import modelLoader

MODEL = modelLoader()
TOKENIZER = TweetTokenizer()

# Build paths inside the project like this: os.path.join(BASE_DIR, ...)
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

# Quick-start development settings - unsuitable for production
# See https://docs.djangoproject.com/en/2.1/howto/deployment/checklist/

# SECURITY WARNING: keep the secret key used in production secret!
SECRET_KEY = '!q+5d6*yxnlrb-f5@n%9__c!gw&zf4mw9y+)drcodbq1@q@71$'

# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = True

ALLOWED_HOSTS = []
import CRF.definitions as definitions
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import SGDClassifier
from CMUTweetTagger import runtagger_parse
from spacy.language import Tokenizer, GoldParse
from spacy.tokenizer import Tokenizer
from spacy.attrs import ORTH, LEMMA
import spacy
# NLTK imports required by the objects constructed below (missing from the original snippet)
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

nlp = spacy.load("en_core_web_sm")
tokenizer = Tokenizer(nlp.vocab)
lancaster_stemmer = LancasterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()
tknzr = TweetTokenizer(preserve_case=True, strip_handles=False, reduce_len=False)
stop = set(stopwords.words('english'))


def get_tuples(dspath):
    sentences = []
    s = ''
    tokens = []
    ners = []
    poss = []
    tot_sentences = 0
    ners_by_position = []
    index = 0
    with open(dspath) as f:
        for line in f:
    totRepetitions = 0
    d = collections.defaultdict(int)
    for c in tw:
        d[c] += 1
    for c in sorted(d, key=d.get, reverse=True):
        if d[c] > 1:
            totRepetitions = totRepetitions + d[c]
    wordLength = sum(1 for c in tw)
    repPercent = totRepetitions / wordLength
    return repPercent


# client = corenlp.CoreNLPClient(start_server=False, annotators="sentiment".split())

# Preprocessing and Tokenization
tk = TweetTokenizer()
p = Preprocess()
text_processor = TextPreProcessor(
    # terms that will be normalized
    normalize=['email', 'percent', 'money', 'phone', 'time', 'url', 'date', 'number'],
    fix_html=True,                  # fix HTML tokens
    segmenter="twitter",
    corrector="twitter",
    unpack_hashtags=True,           # perform word segmentation on hashtags
    unpack_contractions=True,       # Unpack contractions (can't -> can not)
    spell_correct_elong=True,       # spell correction for elongated words
    dicts=[emoticons])

# REPLACE with SPECIAL TAGS
import re
import csv
import time
import os, sys, codecs
from nltk.text import Text
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from collections import Counter
from nltk.stem import WordNetLemmatizer

start = time.time()
lemmatizer = WordNetLemmatizer()
tweet = TweetTokenizer(strip_handles=True)

# Setting Stopwords
stop_words = set(stopwords.words('english'))
# Updating stop words with punctuation
stop_words.update([
    '.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}',
    '/', '-', '~', '&', '*', '<', '>', '=', '%'
])
# updating stopwords with links
stop_words.update(['http', 'httpbitly', 'httptinyurl', '://'])
# updating stopwords with expressions and words of no impact
stop_words.update([
    '่', 'ã€', 'ã€', '。', 'ã€', 'é', '|', 'ï¼', '…', '’', '่', '^', ',', ')',
    '้', 'ั', '#p2', '。', '’', '#tcot', 'ั', 'ã€', '่', 'via', 'ã€'
])
class MultimodalPreprocessor: log = logging.getLogger("MulitmodalPreprocessor") def __init__(self, max_dict_size=MM_MAX_DICT_SIZE): self.max_dict_size = max_dict_size self.token_to_id = {TOKEN_UNK: 0} self.next_id = 1 self.tokenizer = TweetTokenizer(preserve_case=True) def __len__(self): return len(self.token_to_id) def __call__(self, batch, cuda=False, device_id=None): """ Convert list of multimodel observations (tuples with image and text string) into the form suitable for ModelMultimodal to disgest :param batch: """ tokens_batch = [] for img_obs, txt_obs in batch: tokens = self.tokenizer.tokenize(txt_obs) idx_obs = self.tokens_to_idx(tokens) tokens_batch.append((img_obs, idx_obs)) # sort batch decreasing to seq len tokens_batch.sort(key=lambda p: len(p[1]), reverse=True) img_batch, seq_batch = zip(*tokens_batch) lens = list(map(len, seq_batch)) # convert data into the target form # images img_v = Variable(torch.from_numpy(np.array(img_batch))) # sequences seq_arr = np.zeros(shape=(len(seq_batch), max(len(seq_batch[0]), 1)), dtype=np.int64) for idx, seq in enumerate(seq_batch): seq_arr[idx, :len(seq)] = seq # Map empty sequences into single #UNK token if len(seq) == 0: lens[idx] = 1 seq_v = Variable(torch.from_numpy(seq_arr)) if cuda: img_v = img_v.cuda(device_id=device_id) seq_v = seq_v.cuda(device_id=device_id) seq_p = rnn_utils.pack_padded_sequence(seq_v, lens, batch_first=True) return img_v, seq_p def tokens_to_idx(self, tokens): res = [] for token in tokens: idx = self.token_to_id.get(token) if idx is None: if self.next_id == self.max_dict_size: self.log.warning( "Maximum size of dict reached, token '%s' converted to #UNK token", token) idx = 0 else: idx = self.next_id self.next_id += 1 self.token_to_id[token] = idx res.append(idx) return res def save(self, file_name): with open(file_name, 'wb') as fd: pickle.dump(self.token_to_id, fd) pickle.dump(self.max_dict_size, fd) pickle.dump(self.next_id, fd) @classmethod def load(cls, file_name): with open(file_name, "rb") as fd: token_to_id = pickle.load(fd) max_dict_size = pickle.load(fd) next_id = pickle.load(fd) res = MultimodalPreprocessor(max_dict_size) res.token_to_id = token_to_id res.next_id = next_id return res
TASK = "A"  # Define, A or B
FNAME = './predictions-task' + TASK + '.txt'
PREDICTIONSFILE = open(FNAME, "w")

K_FOLDS = 10        # 10-fold crossvalidation
CLF = LinearSVC()   # the default, non-parameter optimized linear-kernel SVM

# Loading dataset and featurised simple Tfidf-BoW model
corpus, y = parse_dataset(DATASET_FP)
X, vectorizer = featurize(corpus)

class_counts = np.asarray(np.unique(y, return_counts=True)).T.tolist()
print(class_counts)
print(corpus)

tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True).tokenize
tokens = tokenizer('\n'.join(corpus))
finder = BigramCollocationFinder.from_words(tokens)
bigram_measures = BigramAssocMeasures()
scored = finder.score_ngrams(bigram_measures.student_t)
sorted(bigram for bigram, score in scored)
# print the ten highest-scoring bigrams (a plain loop; map() is lazy in Python 3)
for bigram, score in scored[:10]:
    print(' '.join(bigram), score)

CLF.fit(X, y)
# Returns an array of the same size as 'y' where each entry is a prediction obtained by cross validation
predicted = cross_val_predict(CLF, X, y, cv=K_FOLDS)
most_informative_feature_for_binary_classification(vectorizer, CLF, n=10)
# Modify F1-score calculation depending on the task
class SocialTextProcessor(AbstractDataProcessor): english_stopwords = english_stopwords.words() english_dictionary = dict.fromkeys(nltk_words.words(), None) my_stopwords = ["still", "just", "emoji", "open", "go", "coin", "see"] lemmatizer = WordNetLemmatizer() stemmer = PorterStemmer() t_word_tokenizer = TweetTokenizer() sentence_tokenizer = PunktSentenceTokenizer() @classmethod def process_document(cls, document): document = cls.html_processing(document) #tokenize words = cls.t_word_tokenizer.tokenize(document) #print(" \n Tokenizing: {} \n".format(words)) #expand contractions words = cls.expand_contractions(words) #print("Expanding contractions: {} \n".format(words)) # to lowercase words = list(map(str.lower, words)) tagged_sentence = pos_tag(words) proper_nouns_tags = ['IN', 'NNP', 'PRP', 'PRP$', 'WP$'] tagged_sentence = [(word, tag) for word, tag in tagged_sentence if tag not in proper_nouns_tags] #print("Filtering tags: {} \n".format(tagged_sentence)) words = [] for word, tag in tagged_sentence: wordnet_tag = cls.find_wordnet_tag(tag) if wordnet_tag != '': word = cls.remove_apos(word) words.append( cls.lemmatizer.lemmatize(word.lower(), wordnet_tag)) elif word in string.punctuation: words.append(word) #print("Lemmatize: {} \n".format(words)) # must be reviewed words = [ word for word in words if word not in string.punctuation and len(word) > 1 and cls.is_english_word(word.lower()) ] #print("Punctuation and english: {} \n".format(words)) words = mark_negation(words) #print("Negation: {} \n".format(words)) stop_wrods = set(cls.english_stopwords + cls.my_stopwords) words = [word for word in words if word.lower() not in stop_wrods] #print("Stop words: {} \n".format(words)) return words @classmethod def remove_apos(cls, text): while "'" in text: text = text.replace("'", "") return text @classmethod def expand_contractions(cls, words): expanded_words = [] for word in words: if word.lower() in CONTRACTION_MAP.keys(): expanded_words += word_tokenize(CONTRACTION_MAP[word.lower()]) else: expanded_words.append(word) return expanded_words @classmethod def html_processing(cls, text): # remove urls text = re.sub(r"http\S+", ' ', text) # remove # text = re.sub(r'#(\S+)', r' \1 ', text) # remove digits text = re.sub(pattern=r"\d", repl=r"", string=text) # replace users, tags with empty space text = re.sub(r'@[\S]+', ' ', text) # Replace #word with empty space text = re.sub(r'#([^\s]+)', ' ', text) # remove duplicated characters text = re.sub(r'\s+', ' ', text) text = re.sub(r'(.)\1+', r'\1\1', text) return text @classmethod def find_wordnet_tag(cls, tag): if tag.startswith('J'): return wordnet.ADJ elif tag.startswith('V'): return wordnet.VERB elif tag.startswith('N'): return wordnet.NOUN elif tag.startswith('R'): return wordnet.ADV else: return '' @classmethod def is_english_word(cls, word): try: cls.english_dictionary[word] return True except KeyError: return False
def new_oracle_data(self): print("Creating New " + self.data_file_name + " File.") path = os.path.join(self.data_dir, self.data_file) tknzr = TweetTokenizer(preserve_case=False) oracle_data = dict() _id = 0 ans2tok = {'Yes': 1, 'No': 0, 'N/A': 2} with gzip.open(path) as file: for json_game in file: game = json.loads(json_game.decode("utf-8")) if self.successful_only: if not game['status'] == 'success': continue if self.history: prev_ques = list() prev_answer = list() prev_length = 0 for i, qa in enumerate(game['qas']): q_tokens = tknzr.tokenize(qa['question']) q_token_ids = [ self.word2i[w] if w in self.word2i else self.word2i['<unk>'] for w in q_tokens ][:self.max_src_length] a_token = ans2tok[qa['answer']] length = len(q_token_ids) if self.history: question = prev_ques + prev_answer + q_token_ids question_length = prev_length + length else: question = q_token_ids question_length = length if self.history: question.extend([self.word2i['<padding>']] * (self.max_diag_len - len(question))) else: question.extend([self.word2i['<padding>']] * (self.max_src_length - len(question))) for i, o in enumerate(game['objects']): if o['id'] == game['object_id']: # target object information spatial = get_spatial_feat_v2( bbox=o['bbox'], im_width=game['image']['width'], im_height=game['image']['height']) object_category = o['category_id'] break oracle_data[_id] = dict() oracle_data[_id]['question'] = question oracle_data[_id]['length'] = question_length oracle_data[_id]['answer'] = a_token oracle_data[_id]['image_file'] = game['image']['file_name'] oracle_data[_id]['spatial'] = spatial oracle_data[_id]['game_id'] = str(game['id']) oracle_data[_id]['obj_cat'] = object_category prev_ques = copy.deepcopy(q_token_ids) prev_answer = [copy.deepcopy(a_token)] prev_length = length + 1 _id += 1 oracle_data_path = os.path.join(self.data_dir, self.data_file_name) with io.open(oracle_data_path, 'wb') as f_out: data = json.dumps(oracle_data, ensure_ascii=False) f_out.write(data.encode('utf8', 'replace')) print('done') with open(oracle_data_path, 'r') as file: oracle_data = json.load(file) return oracle_data
    inv_target_dict[i] = tar
    i += 1

x = set()
with open("../train dataset/Stance.csv") as f:
    for row in f:
        x.add(row.strip())
x = list(x)
i = 0
for tar in x:
    stance_dict[tar] = i
    inv_stance_dict[i] = tar
    i += 1
# print target_dict, stance_dict

tknzr = TweetTokenizer()
x_train, y_train = [[] for i in range(5)], [[] for i in range(5)]
X_train, Y_train = [[] for i in range(5)], [[] for i in range(5)]

with open("../train dataset/Tweet.csv") as f1, \
     open("../train dataset/Target.csv") as f2, \
     open("../train dataset/Stance.csv") as f3:
    for l1, l2, l3 in zip(f1, f2, f3):
        tweet = tknzr.tokenize(l1.strip())
        x_train[target_dict[l2.strip()]].append(tweet)
        y_train[target_dict[l2.strip()]].append(l3.strip())

x_dev, y_dev = [[] for i in range(5)], [[] for i in range(5)]
X_dev, Y_dev = [[] for i in range(5)], [[] for i in range(5)]
def process(text, tokenizer=TweetTokenizer(), stopwords=[]):
    text = text.lower()
    tokens = tokenizer.tokenize(text)
    return [word for word in tokens
            if word not in stopwords and not word.isdigit()]
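# Usage sketch (not in the original): a possible call to process() above,
# assuming the nltk stopwords corpus has been downloaded; the sample text is illustrative.
if __name__ == "__main__":
    from nltk.corpus import stopwords as _sw
    print(process("Check out 5 great NLTK tips!", stopwords=_sw.words('english')))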
def preprocessing(document_body):
    tokenizer = TweetTokenizer()
    token_list = tokenizer.tokenize(document_body)
    # drop bare full stops (avoid shadowing the built-in name `str`)
    token_list = [tok for tok in token_list if tok != '.']
    return token_list
from keras import backend as K
from keras.engine.topology import Layer, InputSpec
from keras import initializers, optimizers
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk import tokenize
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn import preprocessing

tknzr = TweetTokenizer()
stop_set = set(stopwords.words('english') + list(string.punctuation))


def preprocessing_tweet(folder, category, all_text, all_time, all_freq, label, reviews):
    count = 0
    for filename in os.listdir(folder):
        # if count > 10:
        #     break
        # count += 1
        sentences = []
        time = []
        freq = []
        if os.path.isdir(os.path.join(folder, filename)):
            path = folder + '/' + filename + '/tweets.json'
            tweets_data = []
import enchant
from nltk.stem.porter import *
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

t = TweetTokenizer()
d = enchant.Dict("en_US")
stemmer = PorterStemmer()
stopword = set(stopwords.words('english'))


def tokenize(text):
    ret = t.tokenize(text)
    return ret


def stem(text):
    ret = []
    for word in tokenize(text):
        word = word.lower()
        if not d.check(word):
            continue
        if word in stopword:
            continue
        word = stemmer.stem(word)
        ret += [word]
    return ret


def similarity(candidate1, candidate2):
    set1 = set(stem(candidate1))
def __init__(self, max_dict_size=MM_MAX_DICT_SIZE):
    self.max_dict_size = max_dict_size
    self.token_to_id = {TOKEN_UNK: 0}
    self.next_id = 1
    self.tokenizer = TweetTokenizer(preserve_case=True)
def isExtroverted(s):
    print(s)
    tempL = ['ESTP', 'ESTJ', 'ESFP', 'ESFJ', 'ENTP', 'ENTJ', 'ENFP', 'ENFJ']
    ret = []
    for i in s:
        if i in tempL:
            ret.append(True)
        else:
            ret.append(False)
    return ret


fil = list(csv.reader(open('mbti_big5scores.csv')))
vocabFile = open('top500vocab.txt', 'r')
tknzr = TweetTokenizer()
# sentAn = SentimentIntensityAnalyzer()  # sentiment analyzer
lancaster = LancasterStemmer()  # PorterStemmer()
wordnetlem = WordNetLemmatizer()
countVect = CountVectorizer()
saver = tf.train.Saver()
vocab = set()
stopWords = set(stopwords.words('english'))
features = {}
textTrack = {}
puncts = set(string.punctuation)

# Neural network statistics
dispEpoch = 2
saveEveryNEpochs = 5
# Import the necessary modules
from nltk.tokenize import regexp_tokenize
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import word_tokenize

# Write a pattern that matches both mentions (@) and hashtags
pattern2 = r"(@\w+|#\w+)"
tweets.append('some of @my_story placed and some #tag')
# Use the pattern on the last tweet in the tweets list
mentions_hashtags = regexp_tokenize(tweets[-1], pattern2)
print(mentions_hashtags)

# Use the TweetTokenizer to tokenize all tweets into one list
tknzr = TweetTokenizer()
all_tokens = [tknzr.tokenize(t) for t in tweets]
print(all_tokens)

german_text = 'Wann gehen wir Pizza essen? 🍕 Und fährst du mit Über? 🚕'
# Tokenize and print all words in german_text
all_words = word_tokenize(german_text)
print(all_words)

# Tokenize and print only capital words
capital_words = r"[A-Z\Ü]\w+"
print(regexp_tokenize(german_text, capital_words))

# Tokenize and print only emoji
emoji = "['\U0001F300-\U0001F5FF'|'\U0001F600-\U0001F64F'|'\U0001F680-\U0001F6FF'|'\u2600-\u26FF\u2700-\u27BF']"
print(regexp_tokenize(german_text, emoji))
def represent_tweet(tweets):
    tokens = TweetTokenizer().tokenize(tweets)
    frequency = defaultdict(int)
    for token in tokens:
        frequency[token] += 1
    return frequency
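# Usage sketch (not in the original): represent_tweet above returns a
# token -> count mapping; it assumes defaultdict and TweetTokenizer are imported.
if __name__ == "__main__":
    print(represent_tweet("good good vibes #monday"))
    # e.g. defaultdict(<class 'int'>, {'good': 2, 'vibes': 1, '#monday': 1})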
# python library imports
import string
import re
import nltk
import spacy
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

nlp = spacy.load("en_core_web_sm")
TKNZ = TweetTokenizer()
LEM = WordNetLemmatizer()
STOP_WORDS = stopwords.words('english')


def filter_tweets(tweets_list, expr):
    output = []
    try:
        # list of tweets
        for tweet in tweets_list:
            text = tweet['text']
            found = re.search(expr, text)
            if found:
                output.append(tweet['text'])
    except:
        # list of tweet 'text' body
        for text in tweets_list:
            found = re.search(expr, text)
            if found:
                output.append(text)
# -*- coding: utf-8 -*-
"""
Created on Fri Sep 6 21:56:45 2019

@author: Lakshay Dhiman
"""
import matplotlib.pyplot as plt
from nltk.tokenize import TweetTokenizer
import pandas as pd
from nltk.corpus import wordnet
import random

dataset = pd.read_csv('tweets-dataset.csv')
x = dataset.iloc[:, :].values
tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)

y = set()
t = 0
for i in range(len(x)):
    p = tknzr.tokenize(x[i][0])
    for j in range(len(p)):
        if (p[j] != '?' and p[j] != '!' and p[j] != '.' and p[j] != ','):
            y.add(p[j])
            t = t + 1

z = list(y)
p = []
for i in range(40000):
    h = random.randint(0, len(z) - 1)
    syn = []
    for j in wordnet.synsets(z[h]):
        for k in j.lemmas():
            if (z[h] == k.name()):
                syn.append(k.name())
class StreamProcess(KafkaConsumer): def __init__(self, *args, **kwargs): self.broker = kwargs['bootstrap_servers'] self._classifier_filepath = kwargs.pop('classifier_filepath', None) self.influxdb_host = kwargs.pop('influxdb_host', 'localhost') self.influxdb_port = kwargs.pop('influxdb_port', 8086) self.influxdb_database = kwargs.pop('influxdb_database', None) super().__init__(*args, **kwargs) self._stopwords = stopwords.words('english') with open(self._classifier_filepath, 'rb') as f: self._classifier = pickle.load(f) self._word_tokenizer = TweetTokenizer(preserve_case=True, reduce_len=False, strip_handles=False) self._lemmatizer = WordNetLemmatizer() self.influxdb_client = InfluxDBClient(host=self.influxdb_host, port=self.influxdb_port, username='******', password='******', database=self.influxdb_database) self.influxdb_client.create_database(self.influxdb_database) def process(self): try: message = self.__next__() tweet = message.value.decode('utf-8').strip() polarity = self._classify(tweet) wrapper = '+' if polarity == 'Positive' else '-' data_point = [{ # "timestamp": "measurement": "sentiments", "tags": { "language": "en", "polarity": polarity }, "fields": { "tweet": tweet } }] if self.influxdb_client.write_points(data_point): logging.info("DB SUCCESSFUL") else: logging.info("DB FAILED") # logging.info(message.offset) except StopIteration as e: logging.warning( "No incoming message found at Kafka broker: {}.".format( self.broker)) return def _tokenize(self, tweet): return self._word_tokenizer.tokenize(tweet) def _is_noise(self, word): pattern = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+|(@[A-Za-z0-9_]+)' return word in string.punctuation \ or word.lower() in self._stopwords \ or re.search(pattern, word, re.IGNORECASE) != None def _tag2type(self, tag): """ Take a tag and return a type Common tags are: - NNP: Noun, proper, singular - NN: Noun, common, singular or mass - IN: Preposition or conjunction, subordinating - VBG: Verb, gerund or present participle - VBN: Verb, past participle return 'n' for noun, 'v' for verb, and 'a' for any """ if tag.startswith('NN'): return 'n' elif tag.startswith('VB'): return 'v' else: return 'a' def _lemmatize(self, tokens): return [ self._lemmatizer.lemmatize(word, self._tag2type(tag)).lower() for word, tag in pos_tag(tokens) if not self._is_noise(word) ] def _classify(self, tweet): tokens = self._lemmatize(self._tokenize(tweet)) return self._classifier.classify( dict([token, True] for token in tokens))
def demo_tweets(trainer, n_instances=None, output=None): """ Train and test Naive Bayes classifier on 10000 tweets, tokenized using TweetTokenizer. Features are composed of: - 1000 most frequent unigrams - 100 top bigrams (using BigramAssocMeasures.pmi) :param trainer: `train` method of a classifier. :param n_instances: the number of total tweets that have to be used for training and testing. Tweets will be equally split between positive and negative. :param output: the output file where results have to be reported. """ from nltk.tokenize import TweetTokenizer from nltk.sentiment import SentimentAnalyzer from nltk.corpus import twitter_samples, stopwords # Different customizations for the TweetTokenizer tokenizer = TweetTokenizer(preserve_case=False) # tokenizer = TweetTokenizer(preserve_case=True, strip_handles=True) # tokenizer = TweetTokenizer(reduce_len=True, strip_handles=True) if n_instances is not None: n_instances = int(n_instances / 2) fields = ['id', 'text'] positive_json = twitter_samples.abspath("positive_tweets.json") positive_csv = 'positive_tweets.csv' json2csv_preprocess(positive_json, positive_csv, fields, limit=n_instances) negative_json = twitter_samples.abspath("negative_tweets.json") negative_csv = 'negative_tweets.csv' json2csv_preprocess(negative_json, negative_csv, fields, limit=n_instances) neg_docs = parse_tweets_set(negative_csv, label='neg', word_tokenizer=tokenizer) pos_docs = parse_tweets_set(positive_csv, label='pos', word_tokenizer=tokenizer) # We separately split subjective and objective instances to keep a balanced # uniform class distribution in both train and test sets. train_pos_docs, test_pos_docs = split_train_test(pos_docs) train_neg_docs, test_neg_docs = split_train_test(neg_docs) training_tweets = train_pos_docs + train_neg_docs testing_tweets = test_pos_docs + test_neg_docs sentim_analyzer = SentimentAnalyzer() # stopwords = stopwords.words('english') # all_words = [word for word in sentim_analyzer.all_words(training_tweets) if word.lower() not in stopwords] all_words = [word for word in sentim_analyzer.all_words(training_tweets)] # Add simple unigram word features unigram_feats = sentim_analyzer.unigram_word_feats(all_words, top_n=1000) sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats) # Add bigram collocation features bigram_collocs_feats = sentim_analyzer.bigram_collocation_feats( [tweet[0] for tweet in training_tweets], top_n=100, min_freq=12) sentim_analyzer.add_feat_extractor(extract_bigram_feats, bigrams=bigram_collocs_feats) training_set = sentim_analyzer.apply_features(training_tweets) test_set = sentim_analyzer.apply_features(testing_tweets) classifier = sentim_analyzer.train(trainer, training_set) # classifier = sentim_analyzer.train(trainer, training_set, max_iter=4) try: classifier.show_most_informative_features() except AttributeError: print( 'Your classifier does not provide a show_most_informative_features() method.' ) results = sentim_analyzer.evaluate(test_set) if output: extr = [f.__name__ for f in sentim_analyzer.feat_extractors] output_markdown(output, Dataset='labeled_tweets', Classifier=type(classifier).__name__, Tokenizer=tokenizer.__class__.__name__, Feats=extr, Results=results, Instances=n_instances)
tweet2 = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet2)

# remove hashtags
# only removing the hash # sign from the word
tweet2 = re.sub(r'#', '', tweet2)

print(tweet2)

# Tokenize the string
print()
print('\033[92m' + tweet2)
print('\033[94m')

# instantiate tokenizer class
tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)

# tokenize tweets
tweet_tokens = tokenizer.tokenize(tweet2)

print()
print('Tokenized string:')
print(tweet_tokens)

# Remove stop words and punctuations
# Import the english stop words list from NLTK
stopwords_english = stopwords.words('english')

print('Stop words\n')
print(stopwords_english)
def get_tags(tweet):
    return [word for word in TweetTokenizer().tokenize(tweet)
            if word.startswith('#')]
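# Usage sketch (not in the original): get_tags above keeps only the hashtag
# tokens produced by TweetTokenizer; the sample tweet is illustrative.
if __name__ == "__main__":
    print(get_tags("Great match today! #football #win @friend"))  # ['#football', '#win']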
df['preprocessed_tweet'] = df['preprocessed_tweet'].apply(lambda tw: remove_squarebrackets(tw))
df['preprocessed_tweet'] = df['preprocessed_tweet'].apply(lambda tw: remove_nonunicode(tw))
df['preprocessed_tweet'] = df['preprocessed_tweet'].apply(lambda tw: remove_symbols(tw))
df['preprocessed_tweet'] = df['preprocessed_tweet'].apply(lambda tw: tw.lower())
df['preprocessed_tweet'] = df['preprocessed_tweet'].apply(lambda tw: reduce_lengthening(tw))

# tokenization
df['preprocessed_tweet'] = df['preprocessed_tweet'].apply(lambda tw: TweetTokenizer().tokenize(tw))

# df['preprocessed_tweet'] = df['preprocessed_tweet'].apply(lambda tw: spell_correction(tw))
df['preprocessed_tweet'] = df['preprocessed_tweet'].apply(lambda tw: remove_numbers(tw))
df['preprocessed_tweet'] = df['preprocessed_tweet'].apply(lambda tw: normalize_slangs(tw))
df['preprocessed_tweet'] = df['preprocessed_tweet'].apply(lambda tw: remove_stopwords(tw))
df['preprocessed_tweet'] = df['preprocessed_tweet'].apply(lambda tw: stemming(tw))

# df.to_csv("trainset_preprocessed.csv")
"""## Classification

### Stratified sampling
def tokenize(tweet):
    tknzr = TweetTokenizer(strip_handles=True, reduce_len=True, preserve_case=False)
    return tknzr.tokenize(tweet)
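# Usage sketch (not in the original): with strip_handles=True the @-mention is
# dropped, preserve_case=False lower-cases tokens, and reduce_len=True trims
# characters repeated more than three times.
if __name__ == "__main__":
    print(tokenize("@SomeUser THAT was sooooooo good!"))
    # roughly: ['that', 'was', 'sooo', 'good', '!']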
def TokenWords(My_sent):
    tokenizer_words = TweetTokenizer()
    tokens_words = [tokenizer_words.tokenize(t) for t in My_sent]
    return tokens_words
db = conn[db_name]
# collection
colection = db.tweets
# query: find all documents
results = colection.find()
# close the mongoDB connection
conn.close()
# convert the results to a list
list_results = list(results)

# print the time and the text
for record in list_results:
    print('At %s: \t %s.' % (record['time'], record['text']))

# *** word frequency mining ***
# tokenizer
tweet_tokenizer = TweetTokenizer()
# punctuation list
punct = list(string.punctuation)
# download 127 English stop words
import nltk
nltk.download('stopwords')
# list of stop words and punctuations
stopword_list = stopwords.words('english') + punct + ['rt', 'via']
# record the number of occurrences for each word
tf = Counter()
all_dates = []
# get the text and the time
for element in list_results:
    message = element['text']
# unsupervised learning:
# see if clusters exist between different types of hate speech
# use tf-idf, dimensionality reduction, then a clustering algorithm to separate
# if clusters of hate speech don't exist, try and find other clusters with other datasets

# priorities:
# 1: unsupervised learning and dimensionality reduction
#    (try TSNE https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html)
# 2: PCA, cross-validation error, other classifiers and encodings

# data processing
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer(reduce_len=True)
nltk.download('stopwords')
from nltk.corpus import stopwords
import string
from joblib import dump, load
import os.path
from os import path

# get pd dataframe for training data
df_data = pd.read_csv("twitter-sentiment-analysis-hatred-speech/train.csv",
                      names=('id', 'label', 'tweet'), header=None)


def processTweet(tweet):
    tokens = tknzr.tokenize(tweet[0:-1])