def genData():
    """
    Build parallel lists of lowercased tweet texts and numeric label ids.

    :rtype: tuple of (list of str, list of int)
    """
    labelMap = {"none": 0, "racism": 1, "sexism": 2, "neutral": 3, "good": 4}
    textsSamples, labels = [], []
    tweetData = getData()
    for tweet in tweetData:
        assert isinstance(tweet, dict)  # each record is a dict with 'text' and 'label'
        textsSamples.append(tweet["text"].lower())
        labels.append(labelMap[tweet["label"]])
    print("Found %s texts. (samples)" % len(textsSamples))
    return textsSamples, labels
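# A minimal usage sketch for genData (hypothetical driver code, not from
# the source):
if __name__ == "__main__":
    textsSamples, labels = genData()
    # The lists are parallel: labels[i] is the class id of textsSamples[i].
    print(textsSamples[:2], labels[:2])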
def select_tweets_whose_embedding_exists():
    # Selects the tweets as in the mean_glove_embedding method:
    # keep only tweets with at least one token covered by the GloVe model.
    tweets = getData()
    tweet_return = []
    for tweet in tweets:
        _emb = 0
        words = glove_tokenize(tweet['text'])
        for w in words:
            if w in word2vec_model:  # check if the embedding exists in the GloVe model
                _emb += 1
        if _emb:  # not a blank tweet
            tweet_return.append(tweet)
    print('Tweets selected: %d' % len(tweet_return))
    return tweet_return
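# A hedged sketch of the mean_glove_embedding feature mentioned above; the
# helper name and exact formula are assumptions, not taken from this file.
import numpy as np

def mean_glove_embedding(text, model, dim):
    # Average the GloVe vectors of all in-vocabulary tokens; tweets with no
    # covered tokens fall back to a zero vector.
    vecs = [model[w] for w in glove_tokenize(text) if w in model]
    return np.mean(vecs, axis=0) if vecs else np.zeros(dim)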
import sys
import codecs
import operator

import numpy as np
import gensim, sklearn
from collections import defaultdict
from sklearn.utils import shuffle
import xgboost as xgb

from batch_gen import batch_gen
from my_tokenizer import glove_tokenize
from data_handler import getData

### Preparing the text data
texts = []          # list of text samples
labels_index = {}   # dictionary mapping label name to numeric id
labels = []         # list of label ids
label_map = {'none': 0, 'racism': 1, 'sexism': 2}

tweet_data = getData()
for tweet in tweet_data:
    texts.append(tweet['text'].lower())
    labels.append(label_map[tweet['label']])
print('Found %s texts. (samples)' % len(texts))

# Model choices: logistic, gradient_boosting, random_forest, svm,
# tfidf_svm_linear, tfidf_svm_rbf
model_count = 2
word_embed_size = 200

GLOVE_MODEL_FILE = str(sys.argv[1])
EMBEDDING_DIM = int(sys.argv[2])
MODEL_TYPE = sys.argv[3]
print('Embedding Dimension: %d' % EMBEDDING_DIM)
print('GloVe Embedding: %s' % GLOVE_MODEL_FILE)

word2vec_model1 = np.load('fast_text.npy')  # pre-computed FastText embeddings
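# Hypothetical invocation (the script name is illustrative, not from the
# source); the argument order follows the sys.argv parsing above:
#   python bow_baseline.py GENSIM.glove.twitter.27B.200d.txt 200 gradient_boosting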
import sys
import codecs
import operator

import numpy as np
import gensim, sklearn
from collections import defaultdict
from string import punctuation

from batch_gen import batch_gen
from get_similar_words import get_similar_words
from data_handler import getData

### Preparing the text data
texts = []          # list of text samples
labels_index = {}   # dictionary mapping label name to numeric id
labels = []         # list of label ids
label_map = {'none': 0, 'racism': 1, 'sexism': 2}

tweet_data = getData()
for tweet in tweet_data:
    texts.append(tweet['text'])
    labels.append(label_map[tweet['label']])
print('Found %s texts. (samples)' % len(texts))

EMBEDDING_DIM = int(sys.argv[1])
np.random.seed(42)

# Load the original GloVe file
# SHASHANK files
# GLOVE_MODEL_FILE = "/home/shashank/DL_NLP/glove-twitter" + str(EMBEDDING_DIM) + "-w2v"
# PINKESH files
GLOVE_MODEL_FILE = ("/home/pinkesh/DATASETS/glove-twitter/GENSIM.glove.twitter.27B."
                    + str(EMBEDDING_DIM) + "d.txt")
NO_OF_CLASSES = 3
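# A minimal sketch of loading the vectors declared above, assuming the file
# at GLOVE_MODEL_FILE is already in word2vec text format (the "GENSIM."
# filename prefix suggests a prior glove2word2vec conversion):
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(GLOVE_MODEL_FILE)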