def extractW2VAggrFeatures(w2vmodel, phrasemodel, tweets, targets, labels):
    feats = []
    # for each tweet, average the word vectors of its tokens
    for i, tweet in enumerate(tweets):
        tokenised_tweet = tokenize(tweet.lower())
        words = filterStopwords(tokenised_tweet)
        vect = []
        for token in phrasemodel[words]:
            try:
                vect.append(w2vmodel[token])
            except KeyError:
                pass  # token not in the word2vec vocabulary
        if len(vect) > 0:
            mtrmean = np.average(vect, axis=0)
            if i == 0:
                feats = mtrmean
            else:
                feats = np.vstack((feats, mtrmean))
        else:
            # no in-vocabulary tokens: fall back to a zero vector
            # (300-dimensional vector for now)
            if i == 0:
                feats = np.zeros(300)
            else:
                feats = np.vstack((feats, np.zeros(300)))
    return feats
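# A minimal sketch of the aggregation above on toy 2-d vectors (the real
# model is 300-dimensional): each tweet becomes the mean of its token
# vectors, and the per-tweet means are stacked row-wise into a matrix.
#
#   vect = [np.array([1.0, 2.0]), np.array([3.0, 4.0])]
#   np.average(vect, axis=0)                             # -> array([2., 3.])
#   np.vstack((np.average(vect, axis=0), np.zeros(2)))   # one row per tweet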
def prepData(stopfilter, multiword, useDev=False): print("Preparing data...") ret = [] # list of lists print("Reading data...") tweets = readTweets() tweets_train, targets_train, labels_train = readTweetsOfficial( tokenize_tweets.FILETRAIN, 'windows-1252', 2) tweets_trump, targets_trump, labels_trump = readTweetsOfficial( tokenize_tweets.FILETRUMP, 'utf-8', 1) print(str(len(tweets))) tweets.extend(tweets_train) print(str(len(tweets_train)), "\t", str(len(tweets))) tweets.extend(tweets_trump) print(str(len(tweets_trump)), "\t", str(len(tweets))) if useDev == True: tweets_dev, targets_dev, labels_dev = readTweetsOfficial( tokenize_tweets.FILEDEV, 'windows-1252', 2) tweets.extend(tweets_dev) print(str(len(tweets_dev)), "\t", str(len(tweets))) print("Tokenising...") for tweet in tweets: tokenised_tweet = tokenize(tweet.lower()) if stopfilter: words = filterStopwords(tokenised_tweet) ret.append(words) else: ret.append(tokenised_tweet) if multiword: return learnMultiword(ret) else: return ret
def prepData(stopfilter, multiword, useDev=False): print("Preparing data...") ret = [] # list of lists print("Reading data...") tweets = readTweets() tweets_train, targets_train, labels_train = readTweetsOfficial(tokenize_tweets.FILETRAIN, 'windows-1252', 2) tweets_trump, targets_trump, labels_trump = readTweetsOfficial(tokenize_tweets.FILETRUMP, 'utf-8', 1) print(str(len(tweets))) tweets.extend(tweets_train) print(str(len(tweets_train)), "\t" , str(len(tweets))) tweets.extend(tweets_trump) print(str(len(tweets_trump)), "\t" , str(len(tweets))) if useDev == True: tweets_dev, targets_dev, labels_dev = readTweetsOfficial(tokenize_tweets.FILEDEV, 'windows-1252', 2) tweets.extend(tweets_dev) print(str(len(tweets_dev)), "\t" , str(len(tweets))) print("Tokenising...") for tweet in tweets: tokenised_tweet = tokenize(tweet.lower()) if stopfilter: words = filterStopwords(tokenised_tweet) ret.append(words) else: ret.append(tokenised_tweet) if multiword: return learnMultiword(ret) else: return ret
def extractFeatureVocab(tweets, keyword="all", usephrasemodel=True, phrasemodel="phrase.model", anon_targets=False): tokencounts = Counter() features_final = [] bigram = Phrases(phrasemodel) #tokens_topic = [] #if keyword == "all": # for top in tokenize_tweets.TOPICS: # if top != 'clinton': # for tok in tokenize(tokenize_tweets.TOPICS_LONG[top]): # tokens_topic.append(tok) #else: # tokens_topic = tokenize(tokenize_tweets.TOPICS_LONG[keyword]) for tweet in tweets: if usephrasemodel == False: tokenised_tweet = tokenize(tweet) for token in tokenised_tweet: #unigram features tokencounts[token] += 1 #for toktopic in tokens_topic: # tokencounts[toktopic + '|' + token] += 1 for l in zip(*[tokenised_tweet[i:] for i in range(2)]): #bigram features tokencounts["_".join(l)] += 1 #for ltop in zip(*[tokens_topic[i:] for i in range(2)]): # tokencounts["_".join(ltop) + '|' + "_".join(l)] += 1 else: # this includes unigrams and frequent bigrams tokens = filterStopwords(tokenize(tweet.lower())) #For Trump it's [1] phrasetoks = bigram[tokens] target_keywords = [] if anon_targets==True: for top in tokenize_tweets.TOPICS: if top == "climate": # hack, this is the only non-list value target_keywords.append("climate") else: #for keyw in tokenize_tweets.KEYWORDS[top]: target_keywords.extend(tokenize_tweets.KEYWORDS[top]) phrasetoks_new = [] for token in phrasetoks: for keyw in target_keywords: if keyw in token: token = token.replace(keyw, "TARGET") phrasetoks_new.append(token) phrasetoks = phrasetoks_new for token in phrasetoks: tokencounts[token] += 1 for l in zip(*[phrasetoks[i:] for i in range(2)]): tokencounts["_".join(l)] += 1 for token, count in tokencounts.most_common(): if count > 1: features_final.append(token) #print token, count return features_final
def extractFeaturesBOW(tweets, targets, features_final, anon_targets=False, usephrasemodel=False, phrasemodel="phrase.model"):
    # phrasemodel is a path to a saved gensim Phrases model, so load it
    bigram = Phrases.load(phrasemodel)
    # maps long topic names back to their short keys
    inv_topics = {v: k for k, v in tokenize_tweets.TOPICS_LONG.items()}
    matrix = []  # np.zeros((len(features_final), len(tweets)))
    for i, tweet in enumerate(tweets):
        vect = np.zeros((len(features_final)))
        if not usephrasemodel:
            tokenised_tweet = tokenize(tweet)
            for token in tokenised_tweet:
                insertIntoVect(features_final, vect, token)
                #for toktopic in tokens_topic:
                #    insertIntoVect(features_final, vect, toktopic + '|' + token)
            for j in zip(*[tokenised_tweet[k:] for k in range(2)]):
                insertIntoVect(features_final, vect, "_".join(j))
                #for ltop in zip(*[tokens_topic[k:] for k in range(2)]):
                #    insertIntoVect(features_final, vect, "_".join(ltop) + '|' + "_".join(j))
        else:
            target_keywords = tokenize_tweets.KEYWORDS.get(inv_topics.get(targets[i]))
            tokens = filterStopwords(tokenize(tweet.lower()))  # For Trump it's [1]
            phrasetoks = bigram[tokens]
            if anon_targets:
                # replace target keywords with a TARGET placeholder
                phrasetoks_new = []
                for token in phrasetoks:
                    if target_keywords == "climate":  # the only non-list value
                        if target_keywords in token:
                            token = token.replace(target_keywords, "TARGET")
                    else:
                        for keyw in target_keywords:
                            if keyw in token:
                                token = token.replace(keyw, "TARGET")
                    phrasetoks_new.append(token)
                phrasetoks = phrasetoks_new
            for token in phrasetoks:
                insertIntoVect(features_final, vect, token)
            for j in zip(*[phrasetoks[k:] for k in range(2)]):
                insertIntoVect(features_final, vect, "_".join(j))
        matrix.append(vect)
        #print(" ".join(str(v) for v in vect), "\n")
    return matrix
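# Hedged usage sketch tying the two functions together: the vocabulary is
# built once on the training tweets, then both splits are featurised against
# it so their columns line up. Variable names here are illustrative.
#
#   vocab = extractFeatureVocab(tweets_train, usephrasemodel=False)
#   X_train = extractFeaturesBOW(tweets_train, targets_train, vocab, usephrasemodel=False)
#   X_dev = extractFeaturesBOW(tweets_dev, targets_dev, vocab, usephrasemodel=False)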
def extractW2VFeaturesSim(w2vmodelfile, phrasemodel, tweets, targets, labels):
    phmodel = Phrases.load(phrasemodel)
    w2vmodel = word2vec.Word2Vec.load(w2vmodelfile)
    inv_topics = {v: k for k, v in tokenize_tweets.TOPICS_LONG.items()}

    for i, tweet in enumerate(tweets):
        # get the neut/pos/neg hashtags for the tweet's target
        neut = KEYWORDS_NEUT[inv_topics[targets[i]]]
        pos = KEYWORDS_POS[inv_topics[targets[i]]]
        neg = KEYWORDS_NEG[inv_topics[targets[i]]]

        tokenised_tweet = tokenize(tweet.lower())
        words = filterStopwords(tokenised_tweet)

        neutcnt, poscnt, negcnt = 0, 0, 0
        neutsc, possc, negsc = 0.0, 0.0, 0.0

        # transform, as earlier, with the phrase model
        for token in phmodel[words]:
            try:
                neutsim = w2vmodel.similarity(neut, token)
                neutsc += neutsim
                neutcnt += 1
            except KeyError:
                neutsim = 0
            try:
                possim = w2vmodel.similarity(pos, token)
                possc += possim
                poscnt += 1
            except KeyError:
                possim = 0
            try:
                negsim = w2vmodel.similarity(neg, token)
                negsc += negsim
                negcnt += 1
            except KeyError:
                negsim = 0
            #print(targets[i], "\t", token, "\t", neutsim, "\t", possim, "\t", negsim)

        # mean similarity per hashtag; guard against tweets where no token
        # was in the word2vec vocabulary (count of zero)
        neutsc_tweet = neutsc / max(neutcnt, 1)
        possc_tweet = possc / max(poscnt, 1)
        negsc_tweet = negsc / max(negcnt, 1)
        print(targets[i], "\t", labels[i], "\t", neutsc_tweet, "\t", possc_tweet, "\t", negsc_tweet)
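# The scores printed above are mean cosine similarities between each tweet
# token and the target's seed hashtag; the underlying call (the hashtag and
# token shown are hypothetical vocabulary entries):
#
#   w2vmodel.similarity("#somehashtag", "sometoken")  # -> float in [-1, 1]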
def extractW2VHashFeatures(w2vmodel, phrasemodel, mode, tweets, targets, labels):
    features = []
    inv_topics = {v: k for k, v in tokenize_tweets.TOPICS_LONG.items()}

    for i, tweet in enumerate(tweets):
        # get the neut/pos/neg hashtags for the tweet's target
        neut = KEYWORDS_NEUT[inv_topics[targets[i]]]
        pos = KEYWORDS_POS[inv_topics[targets[i]]]
        neg = KEYWORDS_NEG[inv_topics[targets[i]]]

        # the 60 nearest neighbours of each hashtag in the word2vec space
        neutsim = w2vmodel.most_similar(neut, topn=60)
        possim = w2vmodel.most_similar(pos, topn=60)
        negsim = w2vmodel.most_similar(neg, topn=60)

        tokenised_tweet = tokenize(tweet.lower())
        words = filterStopwords(tokenised_tweet)

        neutcnt, poscnt, negcnt, neutsimp, possimp, negsimp = 0, 0, 0, 0, 0, 0

        # transform, as earlier, with the phrase model
        for token in phrasemodel[words]:
            if neut == token:
                neutsimp = 1
            if pos == token:
                possimp = 1
            if neg == token:
                negsimp = 1
            for n, sc in neutsim:
                if sc >= 0.4 and n == token:
                    neutcnt += 1
            for n, sc in possim:
                if sc >= 0.4 and n == token:
                    poscnt += 1
            for n, sc in negsim:
                if sc >= 0.4 and n == token:
                    negcnt += 1
            #print(targets[i], "\t", labels[i], "\t", neutcnt, "\t", poscnt, "\t", negcnt, "\t", neutsimp, "\t", possimp, "\t", negsimp)

        # if the tweet contains both the pos and the neg hashtag, fold them
        # into a single posneg indicator
        pn = 0
        if possimp and negsimp:
            pn = 1
            possimp = 0
            negsimp = 0

        if mode == "hash":
            features.append([neutsimp, possimp, negsimp, pn])
        if mode == "w2v_hash":
            features.append([neutcnt, poscnt, negcnt, neutsimp, possimp, negsimp, pn])

    featlabels = []
    if mode == "hash":
        featlabels = ["neut_hash", "pos_hash", "neg_hash", "posneg_hash"]
    if mode == "w2v_hash":
        featlabels = ["neut_extw2v", "pos_extw2v", "neg_extw2v",
                      "neut_hash", "pos_hash", "neg_hash", "posneg_hash"]
    return features, featlabels
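# Hedged usage sketch ("w2v.model" and "phrase.model" are hypothetical
# paths): load the trained models and extract the hashtag indicator features
# together with their column labels, e.g. for appending to the BOW matrix.
#
#   w2v = word2vec.Word2Vec.load("w2v.model")
#   phmodel = Phrases.load("phrase.model")
#   feats, featlabels = extractW2VHashFeatures(w2v, phmodel, "w2v_hash", tweets, targets, labels)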