예제 #1
0
def genData():
    """Populate the module-level ``textsSamples`` and ``labels`` lists.

    Reads tweets from ``getData()``; each tweet is a mapping with ``'text'``
    and ``'label'`` keys. Texts are lower-cased before being appended; labels
    are mapped to numeric ids via ``labelMap``.

    :rtype: None  (mutates module-level state only; original docstring
        claimed ``str``, which was wrong — nothing is returned)
    """
    labelMap = {"none": 0, "racism": 1, "sexism": 2, "neutral": 3, "good": 4}
    tweetData = getData()
    for tweet in tweetData:
        # NOTE(review): the original asserted `isinstance(tweet, int)`, which
        # contradicts the dict access below and would always fail; removed.
        textsSamples.append(tweet["text"].lower())
        labels.append(labelMap[tweet["label"]])
    # "fount" -> "found" typo fixed in the log message.
    print("Text was found at: %s (samples)" % len(textsSamples))
예제 #2
0
def select_tweets_whose_embedding_exists():
    """Return the tweets that contain at least one word with a GloVe embedding.

    Mirrors the filtering done by the mean_glove_embedding method: a tweet is
    kept iff ``glove_tokenize(tweet['text'])`` yields at least one token
    present in ``word2vec_model``.

    :rtype: list  (subset of the tweets returned by ``getData()``)
    """
    tweets = getData()
    tweet_return = []
    for tweet in tweets:
        words = glove_tokenize(tweet['text'])
        # any() short-circuits on the first embedded word instead of counting
        # every match as the original `_emb += 1` loop did.
        if any(w in word2vec_model for w in words):  # not a blank tweet
            tweet_return.append(tweet)
    # print() call for Python 3 consistency with the rest of this file
    # (original was a Python 2 print statement); unused X, Y locals removed.
    print('Tweets selected:', len(tweet_return))
    return tweet_return
예제 #3
0
from sklearn.utils import shuffle
import codecs
import operator
import gensim, sklearn
from collections import defaultdict
from batch_gen import batch_gen
from my_tokenizer import glove_tokenize
import xgboost as xgb
from data_handler import getData

### Preparing the text data
label_map = {'none': 0, 'racism': 1, 'sexism': 2}  # label name -> numeric id
labels_index = {}  # dictionary mapping label name to numeric id
texts, labels = [], []  # lower-cased text samples and their label ids
tweet_data = getData()
for entry in tweet_data:
    texts.append(entry['text'].lower())
    labels.append(label_map[entry['label']])
print('Found %s texts. (samples)' % len(texts))

# logistic, gradient_boosting, random_forest, svm, tfidf_svm_linear, tfidf_svm_rbf
import sys  # sys.argv is read below; the file's `import sys` only appears later

model_count = 2
word_embed_size = 200
GLOVE_MODEL_FILE = str(sys.argv[1])  # path to the GloVe embedding file
EMBEDDING_DIM = int(sys.argv[2])     # embedding dimensionality
MODEL_TYPE = sys.argv[3]             # one of the model names listed above
# Converted to print() calls: the originals were Python 2 print statements,
# inconsistent with the print() used elsewhere in this file.
print('Embedding Dimension: %d' % EMBEDDING_DIM)
print('GloVe Embedding: %s' % GLOVE_MODEL_FILE)

word2vec_model1 = np.load('fast_text.npy')
import codecs
import operator
import gensim, sklearn
from collections import defaultdict
from batch_gen import batch_gen
from string import punctuation
from get_similar_words import get_similar_words
import sys
from data_handler import getData

### Preparing the text data
label_map = {'none': 0, 'racism': 1, 'sexism': 2}  # label name -> numeric id
labels_index = {}  # dictionary mapping label name to numeric id
texts, labels = [], []  # raw (not lower-cased) tweet texts and label ids
tweet_data = getData()
for entry in tweet_data:
    texts.append(entry['text'])
    labels.append(label_map[entry['label']])
print('Found %s texts. (samples)' % len(texts))

import numpy as np  # np.random.seed below needs numpy, never imported in this file

EMBEDDING_DIM = int(sys.argv[1])  # embedding dimensionality from the CLI
np.random.seed(42)  # fix the RNG for reproducible runs
# Load the original GloVe file (gensim-converted word2vec text format).
# SHASHANK files
# GLOVE_MODEL_FILE="/home/shashank/DL_NLP/glove-twitter" + str(EMBEDDING_DIM) + "-w2v"

# PINKESH files
GLOVE_MODEL_FILE = "/home/pinkesh/DATASETS/glove-twitter/GENSIM.glove.twitter.27B." + str(
    EMBEDDING_DIM) + "d.txt"
NO_OF_CLASSES = 3  # none / racism / sexism