def extractFeatureVocab(tweets, keyword="all", usephrasemodel=True, phrasemodel="phrase.model", anon_targets=False):
    tokencounts = Counter()
    features_final = []
    bigram = Phrases.load(phrasemodel)  # load the previously trained phrase (bigram) model
    #tokens_topic = []
    #if keyword == "all":
    #    for top in tokenize_tweets.TOPICS:
    #        if top != 'clinton':
    #            for tok in tokenize(tokenize_tweets.TOPICS_LONG[top]):
    #                tokens_topic.append(tok)
    #else:
    #    tokens_topic = tokenize(tokenize_tweets.TOPICS_LONG[keyword])

    for tweet in tweets:
        if usephrasemodel == False:
            tokenised_tweet = tokenize(tweet)
            for token in tokenised_tweet:  # unigram features
                tokencounts[token] += 1
                #for toktopic in tokens_topic:
                #    tokencounts[toktopic + '|' + token] += 1
            for l in zip(*[tokenised_tweet[i:] for i in range(2)]):  # bigram features
                tokencounts["_".join(l)] += 1
                #for ltop in zip(*[tokens_topic[i:] for i in range(2)]):
                #    tokencounts["_".join(ltop) + '|' + "_".join(l)] += 1
        else:  # this includes unigrams and frequent bigrams
            tokens = filterStopwords(tokenize(tweet.lower()))
            phrasetoks = bigram[tokens]
            target_keywords = []
            if anon_targets == True:
                for top in tokenize_tweets.TOPICS:
                    if top == "climate":  # hack, this is the only non-list value
                        target_keywords.append("climate")
                    else:
                        #for keyw in tokenize_tweets.KEYWORDS[top]:
                        target_keywords.extend(tokenize_tweets.KEYWORDS[top])
                phrasetoks_new = []
                for token in phrasetoks:
                    for keyw in target_keywords:
                        if keyw in token:
                            token = token.replace(keyw, "TARGET")
                    phrasetoks_new.append(token)
                phrasetoks = phrasetoks_new
            for token in phrasetoks:
                tokencounts[token] += 1
            for l in zip(*[phrasetoks[i:] for i in range(2)]):
                tokencounts["_".join(l)] += 1

    for token, count in tokencounts.most_common():
        if count > 1:
            features_final.append(token)
            #print(token, count)
    return features_final
def findTokensPhrases(phrasemodel="phrase.model", useDev=False):
    tokencnt = Counter()
    bigram = Phrases.load(phrasemodel)  # load the previously trained phrase (bigram) model
    twcntr = 0
    supercntr = 0
    trumpcntr = 0

    for line in open(INPUT, 'r'):
        twcntr += 1
        tokenised = tokenize(json.loads(line)['text'].lower())
        tokens = filterStopwords(tokenised)  # filter stopwords
        for token in bigram[tokens]:  # the phrase model leaves most words as single tokens and joins frequently co-occurring ones into bigrams
            tokencnt[token] += 1

    for line in io.open(tokenize_tweets.FILETRAIN, encoding='windows-1252', mode='r'):  # the Trump file, by contrast, is utf-8
        if line.startswith('ID\t'):
            continue
        tokens = filterStopwords(tokenize(line.split("\t")[2].lower()))  # column 2 holds the tweet text; for the Trump file it's column [1]
        for token in bigram[tokens]:
            supercntr += 1
            tokencnt[token] += 1

    if useDev == True:
        for line in io.open(tokenize_tweets.FILEDEV, encoding='windows-1252', mode='r'):
            if line.startswith('ID\t'):
                continue
            tokens = filterStopwords(tokenize(line.split("\t")[2].lower()))
            for token in bigram[tokens]:
                supercntr += 1
                tokencnt[token] += 1

    for line in io.open(tokenize_tweets.FILETRUMP, encoding='utf-8', mode='r'):  # the Trump file is utf-8
        if line.startswith('ID\t'):
            continue
        tokens = filterStopwords(tokenize(line.split("\t")[1].lower()))  # for the Trump file the tweet text is in column [1]
        for token in bigram[tokens]:
            trumpcntr += 1
            tokencnt[token] += 1

    output = open(OUTPUT, "wb")
    tokens_pb = Tokens()
    for token, count in tokencnt.most_common():
        if count > 1:  # not even worth saving singletons
            token_pb = tokens_pb.tokens.add()
            token_pb.token = token
            token_pb.count = count
    print("Saving token counts for", len(tokencnt), "tokens.", twcntr, "unlabelled tweets,", trumpcntr, "Donald Trump tweets,", supercntr, "labelled tweets")
    output.write(tokens_pb.SerializeToString())
    output.close()
def extractFeaturesBOW(tweets, targets, features_final, anon_targets=False, usephrasemodel=False, phrasemodel="phrase.model"):
    bigram = Phrases.load(phrasemodel)  # load the previously trained phrase (bigram) model
    matrix = []  # np.zeros((len(features_final), len(tweets)))
    for i, tweet in enumerate(tweets):
        vect = np.zeros((len(features_final)))
        if usephrasemodel == False:
            tokenised_tweet = tokenize(tweet)
            for token in tokenised_tweet:
                insertIntoVect(features_final, vect, token)
                #for toktopic in tokens_topic:
                #    insertIntoVect(features_final, vect, toktopic + '|' + token)
            for l in zip(*[tokenised_tweet[i:] for i in range(2)]):
                insertIntoVect(features_final, vect, "_".join(l))
                #for ltop in zip(*[tokens_topic[i:] for i in range(2)]):
                #    insertIntoVect(features_final, vect, "_".join(ltop) + '|' + "_".join(l))
        else:
            inv_topics = {v: k for k, v in tokenize_tweets.TOPICS_LONG.items()}
            target_keywords = tokenize_tweets.KEYWORDS.get(inv_topics.get(targets[i]))
            tokens = filterStopwords(tokenize(tweet.lower()))
            phrasetoks = bigram[tokens]
            if anon_targets == True:
                phrasetoks_new = []
                for token in phrasetoks:
                    if target_keywords == "climate":  # hack, the only non-list keyword value
                        if target_keywords in token:
                            token = token.replace(target_keywords, "TARGET")
                    else:
                        for keyw in target_keywords:
                            if keyw in token:
                                token = token.replace(keyw, "TARGET")
                    phrasetoks_new.append(token)
                phrasetoks = phrasetoks_new
            for token in phrasetoks:
                insertIntoVect(features_final, vect, token)
            for l in zip(*[phrasetoks[i:] for i in range(2)]):
                insertIntoVect(features_final, vect, "_".join(l))
        matrix.append(vect)
        #print(" ".join(str(v) for v in vect), "\n")
    return matrix
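# A minimal usage sketch (not part of the original code): fit the feature vocabulary
# on the training tweets only, then map both splits onto the same bag-of-words space.
# The tweet/target lists are illustrative and would come from readTweetsOfficial().
def buildBOWMatrices(tweets_train, targets_train, tweets_test, targets_test):
    features = extractFeatureVocab(tweets_train, usephrasemodel=True)
    X_train = extractFeaturesBOW(tweets_train, targets_train, features, usephrasemodel=True)
    X_test = extractFeaturesBOW(tweets_test, targets_test, features, usephrasemodel=True)
    return np.asarray(X_train), np.asarray(X_test), features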
def countHashTags(tweets, labels):
    neut = Counter()
    neg = Counter()
    pos = Counter()
    all = Counter()
    for it, tweet in enumerate(tweets):
        tokenised_tweet = tokenize(tweet)
        label = labels[it]
        for token in tokenised_tweet:
            if token.startswith("#"):
                all[token] += 1
                if label == "NONE":
                    neut[token] += 1
                elif label == "AGAINST":
                    neg[token] += 1
                elif label == "FAVOR":
                    pos[token] += 1
    print("Hashtags\tAll\tNeut\tNeg\tPos")
    for token, count in all.most_common():
        neutrcnt, poscnt, negcnt = 0, 0, 0
        if token in neut:
            neutrcnt = neut[token]
        if token in neg:
            negcnt = neg[token]
        if token in pos:
            poscnt = pos[token]
        print(token, "\t", count, "\t", neutrcnt, "\t", negcnt, "\t", poscnt)
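# Illustrative call (an assumption, not in the original code): the labels come from a
# SemEval-style training file read via readTweetsOfficial() as elsewhere in this code,
# so the per-hashtag counts line up with the NONE/AGAINST/FAVOR stance labels.
def printTrainingHashtagCounts():
    tweets_train, targets_train, labels_train = readTweetsOfficial(tokenize_tweets.FILETRAIN, 'windows-1252', 2)
    countHashTags(tweets_train, labels_train)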
def writeToksToFile():
    tokens, tweets_on_topic, tweets = readToks()
    for topic in TOPICS:
        tokenized_tweets = Tweets()
        for index in tweets_on_topic[topic]:
            tweet = tweets[index]
            tokenized = tokenized_tweets.tweets.add()
            tokenized.tweet = tweet['text']
            for token in tokenize(tweet['text']):
                try:
                    index = tokens.index(token)
                    tokenized.tokens.append(index)
                except ValueError:
                    tokenized.tokens.append(-1)
            print(tokenized.tokens)
        f = open(topic + '.tweets', "wb")
        f.write(tokenized_tweets.SerializeToString())
        f.close()
def prepData(stopfilter, multiword, useDev=False):
    print("Preparing data...")
    ret = []  # list of lists
    print("Reading data...")
    tweets = readTweets()
    tweets_train, targets_train, labels_train = readTweetsOfficial(tokenize_tweets.FILETRAIN, 'windows-1252', 2)
    tweets_trump, targets_trump, labels_trump = readTweetsOfficial(tokenize_tweets.FILETRUMP, 'utf-8', 1)
    print(str(len(tweets)))
    tweets.extend(tweets_train)
    print(str(len(tweets_train)), "\t", str(len(tweets)))
    tweets.extend(tweets_trump)
    print(str(len(tweets_trump)), "\t", str(len(tweets)))
    if useDev == True:
        tweets_dev, targets_dev, labels_dev = readTweetsOfficial(tokenize_tweets.FILEDEV, 'windows-1252', 2)
        tweets.extend(tweets_dev)
        print(str(len(tweets_dev)), "\t", str(len(tweets)))
    print("Tokenising...")
    for tweet in tweets:
        tokenised_tweet = tokenize(tweet.lower())
        if stopfilter:
            words = filterStopwords(tokenised_tweet)
            ret.append(words)
        else:
            ret.append(tokenised_tweet)
    if multiword:
        return learnMultiword(ret)
    else:
        return ret
def prepData(filepath, stopfilter, multiword):
    print("Preparing data...")
    ret = []  # list of lists
    print("Reading data...")
    # this reads a file in JSON format
    #tweets = readTweets(jsonfilepath)
    # this reads SemEval-format tweets
    tweets, _, _, _ = readTweetsOfficial(filepath)
    #tweets = "\n".join(tweets)
    print("Tokenising...")
    for tweet in tweets:
        tokenised_tweet = tokenize(tweet.lower())
        if stopfilter:
            words = filterStopwords(tokenised_tweet)
            ret.append(words)
        else:
            ret.append(tokenised_tweet)
    if multiword:
        return learnMultiword(ret)
    else:
        return ret
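# learnMultiword() is called by both prepData() variants above but is not shown in this
# section. The sketch below is an assumption about what it does, based on the gensim
# Phrases usage elsewhere in this code: train a bigram phrase model on the tokenised
# tweets, save it, and return the phrase-merged token lists. Hyperparameters are hypothetical.
def learnMultiword(tokenised_tweets, outpath="phrase.model"):
    bigram = Phrases(tokenised_tweets, min_count=5, threshold=10.0)
    bigram.save(outpath)
    return [bigram[toks] for toks in tokenised_tweets]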
def extractW2VAggrFeatures(w2vmodel, phrasemodel, tweets, targets, labels):
    feats = []
    # for each tweet, average the word vectors of its (phrase-merged) tokens
    for i, tweet in enumerate(tweets):
        tokenised_tweet = tokenize(tweet.lower())
        words = filterStopwords(tokenised_tweet)
        numvects = 0
        vect = []
        for token in phrasemodel[words]:
            try:
                s = w2vmodel[token]
                vect.append(s)
                numvects += 1
            except KeyError:
                s = 0.0
        if len(vect) > 0:
            mtrmean = np.average(vect, axis=0)
            if i == 0:
                feats = mtrmean
            else:
                feats = np.vstack((feats, mtrmean))
        else:
            feats = np.vstack((feats, np.zeros(300)))  # 300-dimensional vector for now
    return feats
def main(tweet_fp):
    pos_tagger = mallet_wrapper.MalletPOSTagger(_MODEL_FP, _TOKEN2POS_MAPS, _TOKEN_MAPS, _BIGRAM, _TEMP_DIR)
    tweet_tokens_list = []
    sys.stderr.write('Creating mallet test file.\n')
    for line in open(tweet_fp):
        tweet_tokens_list.append(twokenize_wrapper.tokenize(line.rstrip('\n')))
    return pos_tagger.pos_tag_tweets(tweet_tokens_list)
def convertTweetsOfficialToVec(numtoks, tokens, tweets, filtering=False, phrasemodelpath="phrase.model"):
    tokens_sub = tokens[:numtoks]
    tokenized_tweets = Tweets()
    vects = []
    norm_tweets = []
    if filtering == True:
        bigram = Phrases.load(phrasemodelpath)  # load the previously trained phrase (bigram) model
    for tweet in tweets:
        vect = np.zeros(numtoks)  # dimensionality. the most frequent tokens have a low index, so we can do a cutoff. original: 93988
        norm_tweet = []
        tokenized = tokenized_tweets.tweets.add()
        tokenized.tweet = tweet
        if filtering == False:
            tokenised_tweet = tokenize(tokenized.tweet)
        else:
            filtered = filterStopwords(tokenize(tokenized.tweet.lower()))
            tokenised_tweet = bigram[filtered]
        for token in tokenised_tweet:
            try:
                index = tokens_sub.index(token)
            except ValueError:
                index = -1
            if index > -1:
                vect[index] = 1
                norm_tweet.append(token)
            else:
                norm_tweet.append('NULL')
        #print(norm_tweet)
        norm_tweets.append(norm_tweet)
        vects.append(vect)
    return vects, norm_tweets
def findTokensAll():
    tokens = Counter()
    twcntr = 0
    supercntr = 0
    trumpcntr = 0
    for line in open(INPUT, 'r'):
        twcntr += 1
        for token in tokenize(json.loads(line)['text']):
            tokens[token] += 1
    for line in io.open(tokenize_tweets.FILETRAIN, encoding='windows-1252', mode='r'):  # the Trump file, by contrast, is utf-8
        if line.startswith('ID\t'):
            continue
        for token in tokenize(line.split("\t")[2]):  # column 2 holds the tweet text; for the Trump file it's column [1]
            supercntr += 1
            tokens[token] += 1
    for line in io.open(tokenize_tweets.FILETRUMP, encoding='utf-8', mode='r'):
        if line.startswith('ID\t'):
            continue
        for token in tokenize(line.split("\t")[1]):  # for the Trump file the tweet text is in column [1]
            trumpcntr += 1
            tokens[token] += 1
    output = open(OUTPUT, "wb")
    tokens_pb = Tokens()
    for token, count in tokens.most_common():
        if count > 1:  # not even worth saving singletons
            token_pb = tokens_pb.tokens.add()
            token_pb.token = token
            token_pb.count = count
    print("Saving token counts for", len(tokens), "tokens.", twcntr, "unlabelled tweets,", trumpcntr, "Donald Trump tweets,", supercntr, "labelled tweets")
    output.write(tokens_pb.SerializeToString())
    output.close()
def cleanHelper(self, body):
    tokens = tokenize(body)
    tokens = [x.lower().strip() for x in tokens]
    tokens = [x for x in tokens if emoticons(x) == "NA"]
    tokens = [x.strip(r" #\-*!._(){}~,^") for x in tokens]
    tokens = [self.normalizer.replace(x) for x in tokens]
    tokens = [x for x in tokens if re.search(r"\w", x)]
    body = " ".join(tokens)
    return tokens, body
def thinposts(lines):
    for line in lines:
        m = re.search(subreddit_re, line)
        if not m:
            continue
        comment = json.loads(line)
        if comment['body'] == '[deleted]':
            continue
        if comment['subreddit'].lower() in subreddits:
            reformed_text = ' '.join(twokenize_wrapper.tokenize(comment['body']))
            yield reformed_text.strip() + ' <EOS>'
def thinposts(lines):
    posts = []
    for line in lines:
        # if not re.search(subreddit_re, line):
        #     continue
        comment = json.loads(line)
        if comment['text'] == '[deleted]':
            continue
        if comment['community'][1]['name'].lower() in subreddits:
            tokens = twokenize_wrapper.tokenize(comment['text'].strip())
            yield ' '.join(tokens) + ' <EOS> '
def extractW2VFeaturesSim(w2vmodelfile, phrasemodel, tweets, targets, labels):
    phmodel = Phrases.load(phrasemodel)
    w2vmodel = word2vec.Word2Vec.load(w2vmodelfile)
    inv_topics = {v: k for k, v in tokenize_tweets.TOPICS_LONG.items()}

    for i, tweet in enumerate(tweets):
        # get the neut/pos/neg hashtags
        neut = KEYWORDS_NEUT[inv_topics[targets[i]]]
        pos = KEYWORDS_POS[inv_topics[targets[i]]]
        neg = KEYWORDS_NEG[inv_topics[targets[i]]]

        tokenised_tweet = tokenize(tweet.lower())
        words = filterStopwords(tokenised_tweet)

        neutcnt, poscnt, negcnt = 0, 0, 0
        neutsc, possc, negsc = 0.0, 0.0, 0.0

        # transform, as earlier, with the phrase model
        for token in phmodel[words]:
            try:
                neutsim = w2vmodel.similarity(neut, token)
                neutcnt += 1
                neutsc += neutsim
            except KeyError:
                neutsim = 0
            try:
                possim = w2vmodel.similarity(pos, token)
                possc += possim
                poscnt += 1
            except KeyError:
                possim = 0
            try:
                negsim = w2vmodel.similarity(neg, token)
                negsc += negsim
                negcnt += 1
            except KeyError:
                negsim = 0
            #print(targets[i], "\t", token, "\t", neutsim, "\t", possim, "\t", negsim)

        # guard against tweets where no token was in the vocabulary
        neutsc_tweet = neutsc / neutcnt if neutcnt else 0.0
        possc_tweet = possc / poscnt if poscnt else 0.0
        negsc_tweet = negsc / negcnt if negcnt else 0.0
        print(targets[i], "\t", labels[i], "\t", neutsc_tweet, "\t", possc_tweet, "\t", negsc_tweet)
def findTokensJson():
    tokens = Counter()
    for line in open(INPUT, 'r'):
        for token in tokenize(json.loads(line)['text']):
            tokens[token] += 1
    output = open(OUTPUT, "wb")
    tokens_pb = Tokens()
    for token, count in tokens.most_common():
        token_pb = tokens_pb.tokens.add()
        token_pb.token = token
        token_pb.count = count
    output.write(tokens_pb.SerializeToString())
    output.close()
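# Reading the serialized counts back in is not shown in this section; a minimal sketch,
# assuming Tokens is the generated protobuf class used above and OUTPUT is the file
# written by findTokensJson()/findTokensOfficial():
def loadTokenCounts(path=OUTPUT):
    tokens_pb = Tokens()
    with open(path, "rb") as f:
        tokens_pb.ParseFromString(f.read())
    return {t.token: t.count for t in tokens_pb.tokens}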
def findTokensOfficial():
    tokens = Counter()
    for line in io.open(INPUT, encoding='windows-1252', mode='r'):  # the Trump file, by contrast, is utf-8
        if line.startswith('ID\t'):
            continue
        for token in tokenize(line.split("\t")[2]):  # column 2 holds the tweet text; for the Trump file it's column [1]
            tokens[token] += 1
    output = open(OUTPUT, "wb")
    tokens_pb = Tokens()
    for token, count in tokens.most_common():
        token_pb = tokens_pb.tokens.add()
        token_pb.token = token
        token_pb.count = count
    output.write(tokens_pb.SerializeToString())
    output.close()
def extractFeaturesCrossTweetTarget(tweets, targets):
    ret = []
    inv_topics = {v: k for k, v in tokenize_tweets.TOPICS_LONG.items()}
    #TOPICS = inv_topics.keys()
    for i, tweet in enumerate(tweets):
        tokenised_tweet = tokenize(tweet)
        target_keywords = tokenize_tweets.KEYWORDS.get(inv_topics.get(targets[i]))
        target_in_tweet = 0
        for key in target_keywords:
            if key in tweet:
                target_in_tweet = 1
                break
        # the option below respects tokenisation, but since hashtags are not tokenised at the moment, the above works better
        #for tweettok in tokenised_tweet:
        #    if tweettok in target_keywords:
        #        target_in_tweet = 1
        #        break
        ret.append(target_in_tweet)
    return ret
def thinposts(lines):
    posts = []
    for line in lines:
        # if not re.search(subreddit_re, line):
        #     continue
        comment = json.loads(line)
        if comment['text'] == '[deleted]':
            continue
        if comment['community'][1]['name'].lower() in subreddits:
            out_comment = {}
            tokens = twokenize_wrapper.tokenize(comment['text'])
            out_comment['body'] = comment['text']
            out_comment['subreddit'] = comment['community'][1]['name']
            out_comment['author'] = comment['user']['username']
            out_comment['created_utc'] = str(int(comment['createdAt']) // 1000)  # milliseconds -> seconds
            out_comment['tokens'] = tokens
            out_comment['id'] = 'x'
            yield json.dumps(out_comment)
def main(argv):
    import nltk
    import random, re
    import twokenize_wrapper as tok
    import pickle

    stopWords = {}
    st = open('stopWordsNew.txt', 'r')
    inputFile = open(argv[0], 'r')
    outputFile = open(argv[1], 'w')
    maxEntObjectFile = open('maxEntObject.pkl', 'rb')

    for line in st:
        line = line.strip('\n')
        if line not in stopWords:
            stopWords[line] = 1

    def featureFunc(tweet):
        feat = {}
        for word in tweet:
            feat[word] = 1
        return feat

    wnl = nltk.stem.WordNetLemmatizer()
    wordListMap = {}
    tokenizedTweets = []
    totalTweets = 0

    for line in inputFile:
        tweet = line.strip('\n')
        tweet = tweet.lower()
        tweet = re.sub(r'#([^\s]+)', r'\1', tweet)              # hash tag
        tweet = re.sub(r'(@[\w]+)', '_HANDLE_', tweet)          # handle
        tweet = re.sub(r'http[\w://.~?=%&-]+', '_URL_', tweet)  # URL
        tweet = re.sub(r'(/|:|&|\(|\))', ' ', tweet)            # / : & ( ) spaced
        tweet = re.sub(r'(\d+)', r' \1 ', tweet)                # digit clusters spaced
        tweet = re.sub(r'(\w+)(-|;)(\w+)', r'\1 \3', tweet)     # word(-|;)word separated
        tokens = tok.tokenize(tweet)
        pattern = re.compile(r"(.)\1{2,}", re.DOTALL)           # hunggggryy -> hungryy
        newTokens = []
        flag = 0
        for word in tokens:
            word = pattern.sub(r"\1", word)
            # word = word.strip('\'"?,.!')
            word = word.strip('.,();-*~[]_=|+%')
            word = re.sub(r'(\w+)[..|.](\w+)', r'\1 \2', word)
            newWord = word.split()
            for word in newWord:
                word = wnl.lemmatize(word)
                # skip stopwords, empty tokens, digits and stray apostrophes
                if word in stopWords or word == '' or word.isdigit() or word == '\'':
                    continue
                else:
                    if flag == 1:
                        word = "NOT_" + word
                        flag = 0
                    if word == "n't" and flag == 0:
                        flag = 1
                        word = "not"
                    newTokens.append(word)
        for word in newTokens:
            # str = str + word + ' '
            if word not in wordListMap:
                wordListMap[word] = 1
        if len(newTokens) > 0:
            totalTweets = totalTweets + 1
            tokenizedTweets.append(newTokens)

    classifier = pickle.load(maxEntObjectFile)
    for i in range(0, len(tokenizedTweets)):
        testTweet = tokenizedTweets[i]
        pred = classifier.classify(featureFunc(testTweet))
        outputFile.write(str(pred) + "\n")
    return
    i += 1

dictMap = {}
i = 1
for line in open('%s/hbc/data/dictionaries' % (BASE_DIR)):
    dictionary = line.rstrip('\n')
    dictMap[i] = dictionary
    i += 1

dict2label = {}
for line in open('%s/hbc/data/dict-label3' % (BASE_DIR)):
    (dictionary, label) = line.rstrip('\n').split(' ')
    dict2label[dictionary] = label

nLines = 1
for line in sys.stdin:
    line = line.rstrip('\n')
    words = twokenize_wrapper.tokenize(line)
    seq_features = []
    tags = []

    goodCap = capClassifier.Classify(words) > 0.9

    # POS tagging the tweet
    if posTagger:
        pos = posTagger.TagSentence(words)
        pos = [p.split(':')[0] for p in pos]  # remove weights
    else:
        pos = fields[-1].split(' ')

    # Chunking the tweet
    if chunkTagger:
        word_pos = zip(words, [p.split(':')[0] for p in pos])
def process_text(self, sentence):
    sentence = sentence.strip()
    sentence = re.sub(self.open_a, '', sentence)
    sentence = re.sub(self.close_a, '', sentence)
    return tokenize(sentence.lower())
def convertTweetsToVec(topic="all", numtoks='all', phrasemodel=False, phrasemodelpath="phrase.model"):
    print("Reading tokens")
    tokens, tweets_on_topic, tweets = readToks(phrasemodel)
    if phrasemodel == True:
        bigram = Phrases.load(phrasemodelpath)  # load the previously trained phrase (bigram) model
    if numtoks != "all":
        tokens_sub = tokens[:numtoks]
    else:
        tokens_sub = tokens
        numtoks = len(tokens)

    tokenized_tweets = Tweets()
    vects = []
    norm_tweets = []

    print("Converting JSON tweets")
    if topic == 'all':
        #for topic in TOPICS:
        for tweet in tweets:
            vect = np.zeros(numtoks, dtype=bool)  # dimensionality. the most frequent tokens have a low index, so we can do a cutoff. original: 93988
            norm_tweet = []
            tokenized = tokenized_tweets.tweets.add()
            tokenized.tweet = tweet['text']
            if phrasemodel == False:
                tokenised_tweet = tokenize(tweet['text'])
            else:
                filtered = filterStopwords(tokenize(tweet['text'].lower()))
                tokenised_tweet = bigram[filtered]
            for token in tokenised_tweet:
                try:
                    index = tokens_sub.index(token)
                except ValueError:
                    index = -1
                if index > -1:
                    vect[index] = 1
                    norm_tweet.append(token)
                else:
                    norm_tweet.append('NULL')
            #print(norm_tweet)
            norm_tweets.append(norm_tweet)
            vects.append(vect)
    else:  # discouraged, needs to be updated
        for index in tweets_on_topic[topic]:
            tweet = tweets[index]
            vect = np.zeros(numtoks)  # dimensionality. the most frequent tokens have a low index, so we can do a cutoff. original: 93988
            norm_tweet = []
            tokenized = tokenized_tweets.tweets.add()
            tokenized.tweet = tweet['text']
            for token in tokenize(tweet['text']):
                try:
                    index = tokens_sub.index(token)
                except ValueError:
                    index = -1
                if index > -1:
                    vect[index] = 1
                    norm_tweet.append(token)
                else:
                    norm_tweet.append('NULL')
            print(norm_tweet)
            norm_tweets.append(norm_tweet)
            vects.append(vect)
    print("Finished converting JSON tweets")
    return tokens_sub, vects, norm_tweets
def extractW2VHashFeatures(w2vmodel, phrasemodel, mode, tweets, targets, labels):
    features = []
    inv_topics = {v: k for k, v in tokenize_tweets.TOPICS_LONG.items()}

    for i, tweet in enumerate(tweets):
        # get the neut/pos/neg hashtags for the tweet's target
        neut = KEYWORDS_NEUT[inv_topics[targets[i]]]
        pos = KEYWORDS_POS[inv_topics[targets[i]]]
        neg = KEYWORDS_NEG[inv_topics[targets[i]]]
        neutsim = w2vmodel.most_similar(neut, topn=60)
        possim = w2vmodel.most_similar(pos, topn=60)
        negsim = w2vmodel.most_similar(neg, topn=60)

        tokenised_tweet = tokenize(tweet.lower())
        words = filterStopwords(tokenised_tweet)

        neutcnt, poscnt, negcnt, neutsimp, possimp, negsimp = 0, 0, 0, 0, 0, 0

        # transform, as earlier, with the phrase model
        for token in phrasemodel[words]:
            if neut == token:
                neutsimp = 1
            if pos == token:
                possimp = 1
            if neg == token:
                negsimp = 1
            for n, sc in neutsim:
                if sc >= 0.4 and n == token:
                    neutcnt += 1
            for n, sc in possim:
                if sc >= 0.4 and n == token:
                    poscnt += 1
            for n, sc in negsim:
                if sc >= 0.4 and n == token:
                    negcnt += 1
        #print(targets[i], "\t", labels[i], "\t", neutcnt, "\t", poscnt, "\t", negcnt, "\t", neutsimp, "\t", possimp, "\t", negsimp)
        #featint = [neutcnt, poscnt, negcnt, neutsimp, possimp, negsimp]

        # if both the positive and the negative hashtag occur, collapse them into one feature
        pn = 0
        if possimp and negsimp:
            pn = 1
            possimp = 0
            negsimp = 0
        if mode == "hash":
            featint = [neutsimp, possimp, negsimp, pn]
            features.append(featint)
        if mode == "w2v_hash":
            featint = [neutcnt, poscnt, negcnt, neutsimp, possimp, negsimp, pn]
            features.append(featint)

    featlabels = []
    if mode == "hash":
        featlabels = ["neut_hash", "pos_hash", "neg_hash", "posneg_hash"]
    if mode == "w2v_hash":
        featlabels = ["neut_extw2v", "pos_extw2v", "neg_extw2v", "neut_hash", "pos_hash", "neg_hash", "posneg_hash"]
    return features, featlabels
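# extractW2VHashFeatures() expects an already-loaded word2vec model and phrase model.
# A minimal loading sketch mirroring extractW2VFeaturesSim() above; the word2vec model
# path is a hypothetical placeholder, not a file name from the original code.
def loadModelsForHashFeatures(w2vpath="w2v.model", phrasepath="phrase.model"):
    phrasemodel = Phrases.load(phrasepath)
    w2vmodel = word2vec.Word2Vec.load(w2vpath)
    return w2vmodel, phrasemodel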
        self.GetTagger()
        feat_list = []
        for word in words:
            feat_list.append(self.fe.get_features(word))

        # Add context features
        feat_list = features.add_context_features(feat_list)
        self.fe.add_bigram_features(feat_list)

        # Create string to feed into Mallet
        feat_list_str = []
        for word_feats in feat_list:
            feat_list_str.append(' '.join(word_feats))
        self.tagger.stdin.write(("\t".join(feat_list_str) + "\n").encode('utf8'))

        pos = []
        for i in range(len(feat_list)):
            pos.append(self.tagger.stdout.readline().rstrip('\n').strip(' '))
        self.nTagged += 1
        return pos


if __name__ == "__main__":
    posTagger = PosTagger()
    for line in sys.stdin:
        words = twokenize_wrapper.tokenize(line.strip())
        if not words:
            continue
        pos = posTagger.TagSentence(words)
        print("%s\t%s\t%s" % (line, " ".join(words), " ".join(pos)))
def create_project(sent_file, template_folder, new_proj_folder, new_proj_name):
    # Check that directories/files exist (or don't)
    if not os.path.isdir(template_folder):
        print('Template folder does not exist:', template_folder)
        return
    elif os.path.isdir(new_proj_folder):
        print('Will overwrite the following directory:', new_proj_folder)
        return
    elif not os.path.exists(sent_file):
        print('Sentence file does not exist:', sent_file)
        return

    # Make sure the project name doesn't have spaces
    new_proj_name = re.sub(' ', '_', new_proj_name)

    # Copy over all standard mmax2 files
    shutil.copytree(template_folder, new_proj_folder)
    f = open(os.path.join(new_proj_folder, new_proj_name + '.mmax'), 'w')
    f.write(MMAX_STRING % new_proj_name)
    f.close()

    # Create word, pos, and sentence files
    f_word = open(os.path.join(new_proj_folder, new_proj_name + '_words.xml'), 'w')
    f_pos = open(os.path.join(new_proj_folder, new_proj_name + '_POS_level.xml'), 'w')
    f_sent = open(os.path.join(new_proj_folder, new_proj_name + '_sentence_level.xml'), 'w')

    # Add headers
    f_word.write(WORDS_HEADER_STRING)
    f_pos.write(POS_HEADER_STRING)
    f_sent.write(SENT_HEADER_STRING)

    # Tokenise each sentence (tweet) in the file
    tweet_tokens_list = []
    for tweet in open(sent_file):
        tweet_tokens_list.append(twokenize_wrapper.tokenize(tweet.strip()))

    # Load POS tagger and tag the tweets
    pos_tagger = mallet_wrapper.MalletPOSTagger(_model_location, _token2pos, _token, _temp_dir)
    pos_tagged_tweets = pos_tagger.pos_tag_tweets(tweet_tokens_list)

    # Add markables
    word_count = 1
    tweet_count = 1
    for tagged_tweet in pos_tagged_tweets:
        start_count = word_count
        # For each word/pos in the sentence
        for word, pos in tagged_tweet:
            # Check if the word is a user mention, RT, or hashtag
            new_pos = symbol_tag.tag_token(word)
            if new_pos:
                pos = new_pos
            f_word.write(WORD_STRING % (word_count, word))
            f_pos.write(POS_STRING % (word_count, word_count, pos.lower()))
            word_count += 1
        f_sent.write(SENT_STRING % (tweet_count, start_count, word_count - 1))
        tweet_count += 1

    # Add closing tags
    f_word.write('</words>')
    f_pos.write('</markables>')
    f_sent.write('</markables>')
    f_word.close()
    f_pos.close()
    f_sent.close()
tweets_on_topic = defaultdict(list)
for topic in topics:
    for index, tweet in enumerate(tweets):
        for keyword in keywords[topic]:
            if keyword in tweet['text'].lower():
                tweets_on_topic[topic].append(index)
                break

for topic in topics:
    tokenized_tweets = Tweets()
    for index in tweets_on_topic[topic]:
        tweet = tweets[index]
        tokenized = tokenized_tweets.tweets.add()
        tokenized.tweet = tweet['text']
        for token in tokenize(tweet['text']):
            try:
                index = tokens.index(token)
                tokenized.tokens.append(index)
            except ValueError:
                tokenized.tokens.append(-1)
    f = open(topic + '.tweets', "wb")
    f.write(tokenized_tweets.SerializeToString())
    f.close()