Example #1
def load_data(path, word2id, cases_per_file=None):
    """Read the tweet CSVs under `path`, map each tweet's tokens to ids via
    `word2id`, and return (tweet_ids, labels)."""
    tweet_ids = []

    files = listdir(path)

    files = [path + '/' + file for file in files]

    features = pd.DataFrame()

    for file in files:
        if cases_per_file is None:
            file_df = pd.read_csv(file, header=0, usecols=['text', 'label'])
        else:
            file_df = pd.read_csv(file,
                                  header=0,
                                  usecols=['text', 'label'],
                                  nrows=cases_per_file)
        features = pd.concat([features, file_df])

    labels = features.pop('label')
    features['text'] = features['text'].astype(str)

    for _, row in features.iterrows():
        string = pt.tokenize(row.values[0])
        split = string.split()
        tweet = [word2id.get(word, UNKNOWN_INDEX) for word in split]

        tweet_ids.append(tweet)

    labels = np.asarray(labels, dtype=np.int32)

    return tweet_ids, labels
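The key step in `load_data` is the `word2id.get(word, UNKNOWN_INDEX)` lookup, which maps every token to an integer id and sends out-of-vocabulary tokens to a shared unknown id. A toy illustration (the vocabulary and index values below are invented, not the ones used by the project):

UNKNOWN_INDEX = 1                                  # assumed id for out-of-vocabulary tokens
word2id = {'<user>': 2, 'hello': 3, 'world': 4}    # toy vocabulary

tokens = '<user> hello there world'.split()
ids = [word2id.get(w, UNKNOWN_INDEX) for w in tokens]
print(ids)  # [2, 3, 1, 4] -- 'there' is unknown, so it maps to UNKNOWN_INDEX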
Example #2
 def tokenize(self, txt):
     '''Preprocess text and tokenize it'''
     txt = preprocess_twitter.tokenize(txt)
     txt = self.unescape_html(txt)
     txt = self.split_punctuations(txt)
     txt = self.split_emojis(txt)
     txt = self.replace_emojis(txt)
     words = self.tokenizer.tokenize(txt)
     return words
Example #3
 def _text_preprocessor(self, text):
     text = preprocess_twitter.tokenize(text)        
     text = casual.reduce_lengthening(text)
     text = cleanString(setupRegexes('twitterProAna'),text)  
     text = ' '.join([span for notentity,span in tweetPreprocessor(text, ("urls", "users", "lists")) if notentity]) 
     text = text.replace('\t','')
     text = text.replace('< ','<').replace(' >','>')
     text = text.replace('):', '<sadface>').replace('(:', '<smile>')
     text = text.replace(" 't", "t")#.replace("#", "")
     return ' '.join(text.split())
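Throughout these examples, `preprocess_twitter.tokenize` (imported as `pt`, `pre`, `twitter_pr`, or `stanfordPreprocessing`) is a Python port of the Stanford GloVe Twitter preprocessing script. The sketch below is only a rough approximation of the kind of normalisation it performs; the exact rules in each project's copy may differ.

import re

def glove_twitter_normalize(text):
    # Rough approximation of the GloVe Twitter preprocessing rules:
    # entities are replaced with special marker tokens such as <url>, <user>, <hashtag>.
    eyes, nose = r"[8:=;]", r"['`\-]?"
    text = re.sub(r"https?://\S+\b|www\.\S+", " <url> ", text)
    text = re.sub(r"@\w+", " <user> ", text)
    text = re.sub(eyes + nose + r"[)dD]+", " <smile> ", text)
    text = re.sub(eyes + nose + r"\(+", " <sadface> ", text)
    text = re.sub(r"<3", " <heart> ", text)
    text = re.sub(r"[-+]?[.\d]*[\d]+[:,.\d]*", " <number> ", text)
    text = re.sub(r"#(\S+)", r" <hashtag> \1 ", text)
    text = re.sub(r"([!?.]){2,}", r" \1 <repeat> ", text)
    text = re.sub(r"\b(\S*?)(.)\2{2,}\b", r" \1\2 <elong> ", text)
    return ' '.join(text.lower().split())

print(glove_twitter_normalize("@user check out https://t.co/abc #awesome soooo good!!! :)"))
# -> <user> check out <url> <hashtag> awesome so <elong> good ! <repeat> <smile>

The important point is that the marker tokens (`<user>`, `<url>`, `<hashtag>`, ...) exist in the GloVe Twitter vocabulary, so they can later be looked up like ordinary words.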
Example #4
def tweet_preprocess(tweet):
    text = twitter_pr.tokenize(tweet)
    split = text.split()
    tweet = [word2id.get(word, import_data.UNKNOWN_INDEX) for word in split]
    
    tweet_len = len(tweet)

    # sequence.pad_sequences expects ndarray
    tweet = sequence.pad_sequences([tweet],
        maxlen=TWEET_LENGTH,
        truncating='post',
        padding='post',
        value=import_data.PAD_INDEX)
		
    # remove the extra batch dimension added above
    return tweet[0], tweet_len
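`sequence` here is presumably Keras' `keras.preprocessing.sequence` module. The short, self-contained snippet below (with made-up `TWEET_LENGTH` and `PAD_INDEX` values) shows how `pad_sequences` with `padding='post'` and `truncating='post'` turns a variable-length id list into a fixed-length vector, and why the result has to be unwrapped with `[0]`.

from tensorflow.keras.preprocessing import sequence

TWEET_LENGTH = 8   # assumed fixed model input length
PAD_INDEX = 0      # assumed padding id

ids = [12, 7, 305, 9]                   # toy token ids for a short tweet
padded = sequence.pad_sequences([ids],  # wrap in a list: pad_sequences expects a batch
                                maxlen=TWEET_LENGTH,
                                truncating='post',
                                padding='post',
                                value=PAD_INDEX)
print(padded[0])  # e.g. [ 12   7 305   9   0   0   0   0]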
Example #5
def proc_file(in_filename, out_filename, glove_vocab):
    infile = open(in_filename)
    reader = csv.reader(infile, quotechar='"')
    outfile = open(out_filename, 'w')
    writer = csv.writer(outfile, quotechar='"')
    tumblr_vocab = set()
    tokenizer = TweetTokenizer()
    for i,line in enumerate(reader):
        if len(line) != 10:
            continue
        caption = tokenizer.tokenize(tokenize(line[8]))
        num_in = len([x for x in caption if x in glove_vocab])
        for word in caption:
            tumblr_vocab.add(word)
        if num_in * 2 > len(caption):
            writer.writerow(line)
    print "Tumblr vocab size:", len(tumblr_vocab)
    print "Overlap:", len(glove_vocab & tumblr_vocab)
Example #6
 def tokenise_tweet(text):
     text = preprocess_twitter.tokenize(text)
     text = preprocess_tweet(text)
     return ' '.join(text.split())
Example #7
    with open("num_of_tweets.txt", 'w') as f:
        f.write(str(num_dem_tweets))
        f.write('\n')
        f.write(str(num_rep_tweets))

else:
    with open("num_of_tweets.txt", 'w') as f:
        f.write(str(num_dem_tweets))
        f.write('\n')
        f.write(str(num_rep_tweets))

# if num_rep_tweets - old_num_rep_tweets > 10000000:
date = datetime.now().strftime('%Y_%m_%d')
cur2 = conn.cursor('repcur')
cur2.execute('''SELECT content FROM tweetstest WHERE party = False''')
with open('rep_tweets_{}.txt'.format(date), 'w') as f:
    for record in cur2:
        f.write(pre.tokenize(record[0]) + '\n')

# if num_dem_tweets - int(old_num_dem_tweets) > 10000000:
date = datetime.now().strftime('%Y_%m_%d')
cur3 = conn.cursor('demcur')
cur3.execute('''SELECT content FROM tweetstest WHERE party = True''')
with open('dem_tweets_{}.txt'.format(date), 'w') as f:
    for record in cur3:
        f.write(pre.tokenize(record[0]) + '\n')
Example #8
model = KeyedVectors.load_word2vec_format(
    "./glove.twitter.27B/word2vec200d.txt", binary=False)

######  TF-IDF #####
from sklearn.feature_extraction.text import TfidfVectorizer
tfidfVectorizer = TfidfVectorizer(encoding='latin-1',
                                  vocabulary=model.wv.vocab.keys(),
                                  lowercase=True)
tfidf = tfidfVectorizer.fit_transform(data)
#####################

# Create a representation for the whole tweet using GloVe word vectors
import preprocess_twitter as stanfordPreprocessing
for i, tweet in enumerate(data):

    tweet = stanfordPreprocessing.tokenize(tweet).split()
    #Without TF_IDF
    #features.append(buildTwitterVector(tweet,model,size=200))

    #With TF_IDF - do not remove punctuation since Glove was trained with it
    features.append(
        buildTwitterVectorTFIDF(tweet,
                                model,
                                tfidfVectorizer,
                                tfidf.getrow(i).toarray(),
                                size=200))

from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression

result = cross_validate(LogisticRegression(penalty='l2'),
                        X=features,
                        y=sentiment,
                        cv=5)
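`buildTwitterVectorTFIDF` is not shown in this snippet. A plausible minimal sketch, assuming it computes a TF-IDF-weighted average of the GloVe vectors of the tweet's tokens (the name, signature, and behaviour are assumptions, not the original implementation):

import numpy as np

def buildTwitterVectorTFIDF(tokens, model, vectorizer, tfidf_row, size=200):
    # Hypothetical sketch: TF-IDF-weighted mean of the GloVe vectors of `tokens`.
    # `tfidf_row` is the dense TF-IDF row for this tweet, shape (1, n_features).
    vocab_index = vectorizer.vocabulary_   # term -> column in the TF-IDF matrix
    vec = np.zeros(size)
    total_weight = 0.0
    for word in tokens:
        if word in model and word in vocab_index:
            weight = tfidf_row[0][vocab_index[word]]
            vec += weight * model[word]
            total_weight += weight
    return vec / total_weight if total_weight > 0 else vec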
Example #9
import re
from scipy.stats import itemfreq
import pandas as pd
import fasttext as ft
import numpy as np
import sys
from nltk.corpus import stopwords

# Project-local helpers (assumed: `tokenize` is the Stanford GloVe Twitter
# preprocessing function used throughout these examples)
import utils
from preprocess_twitter import tokenize

ds_name = sys.argv[1]
ds_filename = './data/{}_cleaned.txt'.format(ds_name)
model_filename = './model/{}_cleaned.bin'.format(ds_name)
stopset = set(stopwords.words('english'))

loaded_ds = [(x[0], x[1]) for x in utils.load_ds(ds_filename, True)]

labels = [x[0] for x in loaded_ds]
tweets = [tokenize(x[1]) for x in loaded_ds]

tweets = [
    ' '.join([w for w in tw.split() if w not in stopset]) for tw in tweets
]
tweets = [
    ' '.join([w for w in tw.split() if not w.startswith('<')]) for tw in tweets
]

r = re.compile(r'[^a-z\s]')
tweets = [r.sub('', tw).strip() for tw in tweets]

with open('./output/{}_w2v_tweets.txt'.format(ds_name), 'w') as f:
    for i in range(len(tweets)):
        f.write('{} {}\n'.format(labels[i], tweets[i]))
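To make the three cleaning passes above concrete, here is a toy trace of the same filters (stopword removal, dropping the `<...>` marker tokens, and stripping everything but lowercase letters and whitespace) on one invented, already-tokenized tweet:

import re

stopset = {'i', 'this', 'much'}                     # toy stopword set
tweet = '<user> i loooove this soooo much <url> !!!'

tweet = ' '.join(w for w in tweet.split() if w not in stopset)
tweet = ' '.join(w for w in tweet.split() if not w.startswith('<'))
tweet = re.compile(r'[^a-z\s]').sub('', tweet).strip()
print(tweet)  # -> loooove soooo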
Example #10
def run_model(q, stop, predict_fn, word2id, red):
    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
    logging.info("Loaded... listening on channel")
    while (not stop.is_set()):
        try:
            hashtags = 0
            urls = 0
            ids = []

            message = q.get_nowait()
            content = message['content'].decode("utf-8")
            channel = message['channel'].decode("utf-8")
            split = pt.tokenize(content)
            split = split.split()

            for word in split:
                index = word2id.get(word, i_data.UNKNOWN_INDEX)
                if (index == i_data.HASHTAG_INDEX):
                    hashtags += 1
                elif (index == i_data.URL_INDEX):
                    urls += 1
                ids.append(index)

            length = len(ids)
            # extra dimension is for feeding through sequence.pad_sequences
            ids = [ids]
            ids = sequence.pad_sequences(ids,
                                         maxlen=50,
                                         truncating='post',
                                         padding='post',
                                         value=i_data.PAD_INDEX)
            # remove extra dimension
            ids = ids[0]

            # favorite_count, num_hashtags, num_urls, reply_count
            attributes = [0, hashtags, urls, 0]

            # model expects dict below
            inputs = {'text': ids, 'len': length, 'attributes': attributes}

            predictions = predict_fn(inputs)

            logging.info(
                "output class: %d \n user probability: %f \n bot probability: %f",
                predictions['pred_output_classes'][0],
                predictions['probabilities'][0][0],
                predictions['probabilities'][0][1])
            # Predictions: First array holds percentages of confidence of class
            # class at index 0 is a human
            # class at index 1 is a bot

            resp = {}
            resp['class'] = int(predictions['pred_output_classes']
                                [0])  # cast to a native Python int for JSON serialisation
            resp['percentage0'] = float(predictions['probabilities'][0][0])
            resp['percentage1'] = float(predictions['probabilities'][0][1])

            channel = channel.split("-")
            channel = CHANNEL_REPLY + channel[1]
            red.publish(channel, json.dumps(resp))
        # when there is nothing in the queue continue to loop
        except queue.Empty:
            continue
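For reference, the payload published back on the reply channel is simply the three-field dict serialised with `json.dumps`; a toy illustration of its shape (the values are invented):

import json

resp = {'class': 1, 'percentage0': 0.13, 'percentage1': 0.87}
print(json.dumps(resp))
# {"class": 1, "percentage0": 0.13, "percentage1": 0.87}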