def gen_sequence(filename):
    # Map emotion labels to integer ids and convert each tweet into a
    # sequence of vocabulary indices (unknown words map to 'UNK').
    y_map = {'joy': 0, 'anger': 1, 'surprise': 2, 'disgust': 3, 'fear': 4, 'sad': 5}
    tweets = train_tweets if filename == 'tokenized_tweets_train.txt' else test_tweets
    X, y = [], []
    for tweet in tweets:
        text = glove_tokenize(tweet['text'].lower())
        seq = [vocab.get(word, vocab['UNK']) for word in text]
        X.append(seq)
        y.append(y_map[tweet['label']])
    return X, y
def gen_vocab():
    # Build vocab, reverse_vocab and freq over both the train and test tweets.
    vocab_index = 1
    for tweet in train_tweets + test_tweets:
        text = glove_tokenize(tweet['text'].lower())
        text = ' '.join([c for c in text if c not in punctuation])
        words = text.split()
        # words = [word for word in words if word not in STOPWORDS]
        for word in words:
            if word not in vocab:
                vocab[word] = vocab_index
                reverse_vocab[vocab_index] = word  # generate reverse vocab as well
                vocab_index += 1
            freq[word] += 1
    vocab['UNK'] = len(vocab) + 1
    reverse_vocab[len(vocab)] = 'UNK'
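# The function above relies on module-level state. A minimal sketch (an
# assumption, not shown in the source) of the globals it expects: vocab and
# reverse_vocab as plain dicts, freq as a defaultdict so `freq[word] += 1`
# works for unseen words, and `punctuation` from the standard library.
from collections import defaultdict
from string import punctuation

vocab = {}
reverse_vocab = {}
freq = defaultdict(int)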
def gen_sequence():
    # Convert each tweet into a sequence of vocabulary indices plus an integer label.
    y_map = {'none': 0, 'racism': 1, 'sexism': 2}
    X, y = [], []
    for tweet in tweets:
        text = glove_tokenize(tweet['text'].lower())
        seq = [vocab.get(word, vocab['UNK']) for word in text]
        X.append(seq)
        y.append(y_map[tweet['label']])
    return X, y
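# Usage sketch (assumption, not part of the original file). The variable-length
# index sequences from gen_sequence() are normally padded to a fixed length
# before being fed to an embedding layer; MAX_SEQUENCE_LENGTH is a hypothetical
# constant chosen here for illustration, and Keras availability is assumed.
import numpy as np
from keras.preprocessing.sequence import pad_sequences

MAX_SEQUENCE_LENGTH = 30
X, y = gen_sequence()
data = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)  # pad/truncate each sequence
labels = np.array(y)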
def gen_data():
    # Average GloVe embedding per sentence for the stance-detection CSV.
    # Assumes the first column holds the text and the second the label
    # (the column names are not given in the source).
    df = pd.read_csv('data/SD_dataset_FINAL.csv')
    X, y = df.iloc[:, 0], df.iloc[:, 1]
    X_e = []
    for s in X:
        words = glove_tokenize(s)
        emb = np.zeros(word_embed_size)
        for word in words:
            try:
                emb += word2vec_model[word]
            except KeyError:
                pass  # word has no embedding
        emb /= len(words)
        X_e.append(emb)
    return X_e, y
def select_tweets_whose_embedding_exists(tweets, word2vec_model):
    # Keep only tweets containing at least one word with a GloVe embedding.
    tweet_return = []
    for tweet in tweets:
        _emb = 0
        words = glove_tokenize(tweet['text'])
        for w in words:
            if w in word2vec_model:  # check if the embedding exists in the GloVe model
                _emb += 1
        if _emb:  # not a blank tweet
            tweet_return.append(tweet)
    print('Tweets selected:', len(tweet_return))
    return tweet_return
def select_tweets(filename):
    # selects the tweets as in mean_glove_embedding method
    if filename == 'tokenized_tweets_train.txt':
        tweets = get_data('tokenized_tweets_train.txt')
    else:
        tweets = get_data('tokenized_tweets_test.txt')
    tweet_return = []
    c = 1
    for tweet in tweets:
        _emb = 0
        words = glove_tokenize(tweet['text'].lower())
        for w in words:
            if w in word2vec_model:  # check if the embedding exists in the GloVe model
                _emb += 1
        c = c + 1
        # if _emb:  # Not a blank tweet (filter disabled: every tweet is kept)
        tweet_return.append(tweet)
    print('Tweets selected:', len(tweet_return))
    # pdb.set_trace()
    return tweet_return
def getAbusiveFeatures():
    # Count, per tweet, how many tokens appear in the abusive-word lexicon.
    m = {}
    with open('abusive_dict.txt', 'r') as f:
        for line in f:
            m[line.strip()] = True
    tweets = get_data()
    X = []
    for tweet in tweets:
        text = glove_tokenize(tweet['text'].lower())
        c = 0
        for word in text:
            if word in m:
                c = c + 1
        X.append(c)
    return np.array(X)
def gen_vocab():
    # Processing
    vocab_index = 1
    for tweet in tweets:
        text = glove_tokenize(tweet['text'].lower())
        text = ' '.join([c for c in text if c not in punctuation])
        words = text.split()
        for word in words:
            if word not in vocab:
                vocab[word] = vocab_index
                reverse_vocab[vocab_index] = word
                vocab_index += 1
            freq[word] += 1
    vocab['UNK'] = len(vocab) + 1
    reverse_vocab[len(vocab)] = 'UNK'
def gen_data():
    # Represent each tweet as the average GloVe embedding of its words.
    y_map = {'none': 0, 'racism': 1, 'sexism': 2}
    X, y = [], []
    for tweet in tweets:
        words = glove_tokenize(tweet['text'].lower())
        emb = np.zeros(EMBEDDING_DIM)
        for word in words:
            try:
                emb += word2vec_model[word]
            except KeyError:
                pass  # word has no embedding
        emb /= len(words)
        X.append(emb)
        y.append(y_map[tweet['label']])
    return X, y
def select_tweets(dataset, strategy):
    # selects the tweets as in mean_glove_embedding method
    tweets, users_none = get_data_waseem4(dataset, strategy)
    tweet_return = []
    for tweet in tweets:
        _emb = 0
        words = glove_tokenize(tweet['text'].lower())
        for w in words:
            if w in word2vec_model:  # check if the embedding exists in the GloVe model
                _emb += 1
        if _emb:  # not a blank tweet
            tweet_return.append(tweet)
    # pdb.set_trace()
    return tweet_return, users_none
def select_tweets_whose_embedding_exists():
    # selects the tweets as in mean_glove_embedding method
    tweets = get_data()
    tweet_return = []
    for tweet in tweets:
        _emb = 0
        words = glove_tokenize(tweet['text'])
        for w in words:
            if w in word2vec_model:  # check if the embedding exists in the GloVe model
                _emb += 1
        if _emb:  # not a blank tweet
            tweet_return.append(tweet)
    print('Tweets selected:', len(tweet_return))
    # pdb.set_trace()
    return tweet_return
def gen_data(tweets):
    # Average-embedding features and integer labels for the given tweets.
    y_map = {'none': 0, 'racism': 1, 'sexism': 2}
    X, y = [], []
    for tweet in tweets:
        words = glove_tokenize(tweet['text'])
        emb = np.zeros(word_embed_size)
        for word in words:
            try:
                emb += word2vec_model[word]
            except KeyError:
                pass  # word has no embedding
        emb /= len(words)
        X.append(emb)
        y.append(y_map[tweet['label']])
    return np.array(X), np.array(y)
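# Usage sketch (assumption, not part of the original file). The averaged
# embedding features can be fed directly to a scikit-learn classifier; the
# function names mirror the definitions above, and LogisticRegression with
# 5-fold cross-validation is only an illustrative choice.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

tweets = select_tweets_whose_embedding_exists()
X, y = gen_data(tweets)
clf = LogisticRegression(max_iter=1000)
print(cross_val_score(clf, X, y, cv=5, scoring='f1_weighted').mean())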
def get_tfidf_features():
    # Tf-idf features restricted to the tweets labelled 'sexism'.
    y_map = {'none': 0, 'racism': 1, 'sexism': 2}
    tweets = get_data()
    X, y = [], []
    for tweet in tweets:
        text = glove_tokenize(tweet['text'].lower())
        text = ' '.join([c for c in text if c not in punctuation])
        if y_map[tweet['label']] == 2:
            X.append(text)
            y.append(y_map[tweet['label']])
    tfidf_transformer = TfidfVectorizer(ngram_range=(1, 2), analyzer='word',
                                        stop_words='english', max_features=2000)
    X_tfidf = tfidf_transformer.fit_transform(X)
    print(X_tfidf.shape)
    get_top_features(tfidf_transformer)
    return X_tfidf, np.array(y)
def get_tfidf_features():
    # getting list of tweets (each tweet is a dict with keys text, label and user)
    tweets = get_data()
    y_map = {'none': 0, 'racism': 1, 'sexism': 2}
    X, y = [], []
    for tweet in tweets:
        text = glove_tokenize(tweet['text'].lower())  # tokenizing, e.g. converting '#' into <hashtag>
        text = ' '.join([c for c in text if c not in punctuation])  # removing punctuation
        X.append(text)
        y.append(y_map[tweet['label']])
    tfidf_transformer = TfidfVectorizer(ngram_range=(1, 2), analyzer='word',
                                        stop_words='english', max_features=5000)
    X_tfidf = tfidf_transformer.fit_transform(X)
    print(X_tfidf.shape)
    return X_tfidf, np.array(y)
def getAbusiveFeatures():
    # One feature per tweet: the number of tokens found in the abusive-word lexicon.
    y_map = {'none': 0, 'racism': 1, 'sexism': 2}
    m = {}
    with open('abusive_dict.txt', 'r') as f:
        for line in f:
            m[line.strip()] = True
    tweets = get_data()
    X, y = [], []
    for tweet in tweets:
        text = glove_tokenize(tweet['text'].lower())  # does it correct spelling as well?
        c = 0
        for word in text:
            if word in m:
                c = c + 1
        X.append([c])
        y.append(y_map[tweet['label']])
    return np.array(X), np.array(y)
def gen_data():
    # In this function, for all accepted tweets, we turn them into an
    # embedding of EMBEDDING_DIM. We then sum the embeddings of all
    # words within the tweet that have an embedding and divide
    # by the number of words. Hence, the final embedding of the tweet
    # will be the average of the embeddings of its words.
    X_file = "BoWV_X.pickle"
    y_file = "BoWV_y.pickle"
    # Load if pickled files are available
    try:
        X = pickle.load(open(X_file, "rb"))
        y = pickle.load(open(y_file, "rb"))
        print("Features and labels loaded from pickled files.")
    # Create and save otherwise
    except (OSError, IOError):
        print("Creating features and labels...")
        y_map = {'none': 0, 'racism': 1, 'sexism': 2}
        X, y = [], []
        for tweet in tweets:
            words = glove_tokenize(tweet['text'].lower())
            emb = np.zeros(EMBEDDING_DIM)
            for word in words:
                try:
                    emb += word2vec_model[word]
                except KeyError:
                    pass  # word has no embedding
            emb /= len(words)
            X.append(emb)
            y.append(y_map[tweet['label']])
        pickle.dump(X, open(X_file, "wb"))
        pickle.dump(y, open(y_file, "wb"))
    return X, y
def gen_data(tweets_list, word2vec_model, flag):
    # 'binary' collapses racism/sexism/hate into a single positive class.
    if flag == 'binary':
        y_map = {'none': 0, 'racism': 1, 'sexism': 1, 'hate': 1}
    else:
        y_map = {'none': 0, 'racism': 1, 'sexism': 2, 'hate': 1}
    X, y = [], []
    word_embed_size = 200
    for tweet in tweets_list:
        words = glove_tokenize(tweet['text'])
        emb = np.zeros(word_embed_size)
        for word in words:
            try:
                emb += word2vec_model[word]
            except KeyError:
                pass  # word has no embedding
        emb /= len(words)
        X.append(emb)
        y.append(y_map[tweet['label']])
    return np.array(X), np.array(y)
def gen_data():
    # Generate features and labels.
    # Features are the average embedding of all words in the sentence.
    y_map = {'none': 0, 'racism': 1, 'sexism': 2}
    X, y = [], []
    for tweet in tweets:
        words = glove_tokenize(tweet['text'])  # .lower()
        emb = np.zeros(word_embed_size)
        for word in words:
            try:
                emb += word2vec_model[word]
            except KeyError:
                pass  # word has no embedding
        emb /= len(words)
        X.append(emb)
        y.append(y_map[tweet['label']])
    return np.array(X), np.array(y)
def get_liwc_features_from_text():
    # One feature per LIWC category file: the number of tokens in the tweet
    # that start with any word (or stem) listed in that file.
    filenames = glob.glob("./LIWC_features/*.csv")
    print(filenames)
    y_map = {'none': 0, 'racism': 1, 'sexism': 2}
    tweets = get_data()
    X, y = [], []
    # create a dict of lists of words in all liwc files
    features_dict = {}
    for file in filenames:
        m = {}
        with open(file, 'r') as f:
            for line in f:
                m[line.strip()] = True
        features_dict[file] = m
    for tweet in tweets:
        text = glove_tokenize(tweet['text'].lower())
        features = []
        for file in filenames:
            c = 1
            for word in text:
                if any([word.startswith(s) for s in features_dict[file]]):
                    c = c + 1
            features.append(c)
        X.append(features)
        y.append(y_map[tweet['label']])
    # normalise each feature column (z-score)
    X = np.array(X)
    X = (X - X.mean(axis=0)) / X.std(axis=0)
    return X, np.array(y)
import time

import gensim

from data_handler import get_data
from my_tokenizer import glove_tokenize

start = time.time()
# GloVe vectors are distributed as plain text, so they must be loaded as a
# non-binary word2vec file (after converting the .txt to word2vec format,
# e.g. with gensim's glove2word2vec script).
model = gensim.models.KeyedVectors.load_word2vec_format(
    "glove_embeddings/glove.twitter.27B.200d.txt", binary=False)
print('Finished loading original model %.2f min' % ((time.time() - start) / 60))
print('word2vec: %d' % len(model.vocab))

# Collect every token that actually occurs in the dataset.
indices_to_delete = []
j = 0
st = set()
tweets = get_data()
for tweet in tweets:
    words = glove_tokenize(tweet['text'].lower())
    for w in words:
        st.add(w)  # add the whole token (st.update(w) would add its characters)

# Shrink the embedding vocabulary to those tokens, re-numbering the survivors
# and remembering which vector rows to drop.
# (Relies on the pre-4.0 gensim KeyedVectors API: model.vocab / model.index2word.)
for i, w in enumerate(model.index2word):
    l = w.strip().lower()
    if l in st:
        model.vocab[w].index = j
        j += 1
    else:
        del model.vocab[w]
        indices_to_delete.append(i)
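# A minimal sketch (assumption, not shown in the original) of the one-off
# conversion mentioned above: glove2word2vec prepends the
# "<vocab_size> <dimensions>" header that load_word2vec_format expects, and
# the converted file would then be the one passed to the load call.
# In gensim >= 4.0 the same effect is available via no_header=True instead.
from gensim.scripts.glove2word2vec import glove2word2vec

glove2word2vec("glove_embeddings/glove.twitter.27B.200d.txt",
               "glove_embeddings/glove.twitter.27B.200d.w2v.txt")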