예제 #1
0
from preprocess_tweets import tweet_preprocessing
import random
import os
import json

base_path = '../../../../datasets/HateSPic/twitter/json_all/'
out_path = '../../../../datasets/HateSPic/lstm_data/twitter/'

# Export every tweet JSON in base_path as one "id,text" line.
# `with` guarantees the output file is flushed and closed even if the
# loop raises (the original left it open, risking lost buffered writes).
with open(out_path + 'tweets.test', 'w') as out_file:
    for file in os.listdir(base_path):
        print(file)
        # Close each per-tweet JSON handle instead of leaking it.
        with open(base_path + file) as json_file:
            info = json.load(json_file)
        try:
            text = tweet_preprocessing(info['text'].encode('utf-8'))
            # Discard short tweets: fewer than 5 characters or 3 words.
            if len(text) < 5: continue
            if len(text.split(' ')) < 3: continue
            text = text.strip('\r').strip('\n')
            out_file.write(str(info['id']) + ',' + text + '\n')

        except Exception:
            # Best-effort per tweet: skip entries whose text is missing or
            # fails to encode. `except Exception` (not bare `except:`) so
            # KeyboardInterrupt/SystemExit still propagate.
            print("Error with file: " + file)
            continue

print("DONE")
예제 #2
0
                # Fragment of a larger per-line loop (enclosing loop and the
                # file handles hate_file/nothate_file/_val are outside this
                # view). `i` appears to select which CSV layout the current
                # source file uses — TODO confirm against the enclosing loop.
                if i == 0:
                    # Layout 0: label in column 5, tweet text from column 6 on.
                    if len(line.split(',')) < 6:
                        #print("Continuing: " + line)
                        continue
                    text = ' '.join(line.split(',')[6:])
                    label_id = int(line.split(',')[5])

                if i in [1, 2]:
                    # Layouts 1 and 2: label in column 0, text from column 1 on.
                    if len(line.split(',')) < 2:
                        #print("Continuing: " + line)
                        continue
                    text = ' '.join(line.split(',')[1:])
                    label_id = int(line.split(',')[0])

                text = tweet_preprocessing(text)

                # Discard short tweets: fewer than 5 characters or 3 words.
                if len(text) < 5: continue
                if len(text.split(' ')) < 3: continue

                # label_id 0 -> hate: roughly 1-in-11 lines go to the
                # validation file, the rest to training.
                if label_id == 0:
                    if random.randint(0, 10) == 0:
                        hate_file_val.write(text)
                    else:
                        hate_file.write(text)

                # label_id 2 -> not hate, validation sampled ~1-in-63 —
                # presumably to balance class sizes; verify against the data.
                if label_id == 2:
                    if random.randint(0, 62) == 0:
                        nothate_file_val.write(text)
                    else:
예제 #3
0
# Load annotated tweets (a JSON dict keyed by tweet id; data_path and the
# out_file_* handles are defined outside this visible span) and split them
# into train/val/test line lists for the LSTM.
data = json.load(open(data_path, 'r'))

train = []
val = []
test = []

print("Generating lstm data")
# Python 2 dict iteration; v is the per-tweet annotation record.
for k, v in data.iteritems():
    # Count annotators who marked the tweet hateful (any label > 0).
    total_hate = 0
    for label in v['labels']:
        if label > 0:
            total_hate += 1

    # Soft label: fraction of hate votes, assuming exactly 3 annotators
    # per tweet — TODO confirm len(v['labels']) == 3 always holds.
    label = total_hate / 3.0

    text = tweet_preprocessing(v['tweet_text'].encode('utf-8'))

    # Random split: selector 9-10 -> val (20%), 8 -> test (10%),
    # 1-7 -> train (70%). randint bounds are inclusive.
    split_selector = random.randint(1, 10)

    if split_selector > 8:
        val.append(str(k) + ',' + text + ',' + str(label) + '\n')
    elif split_selector > 7:
        test.append(str(k) + ',' + text + ',' + str(label) + '\n')
    else:
        train.append(str(k) + ',' + text + ',' + str(label) + '\n')

# Write out each split (the test-split write loop is truncated below).
for l in train:
    out_file_train.write(l)
for l in val:
    out_file_val.write(l)
for l in test:
예제 #4
0
from preprocess_tweets import tweet_preprocessing
import os
import json

base_path = '../../../../datasets/HateSPic/MMHS/img_txt/'
out_path = '../../../../datasets/HateSPic/MMHS/lstm_data/'

# Export the OCR'd image text of every tweet as one "id,text" line.
# `with` guarantees the output file is flushed and closed (the original
# left it open, risking lost buffered writes).
with open(out_path + 'tweets.img_txt', 'w') as out_file:
    for file in os.listdir(base_path):
        print(file)
        # Close each per-tweet JSON handle instead of leaking it.
        with open(base_path + file) as json_file:
            img_text = json.load(json_file)['img_text']
        try:
            text = tweet_preprocessing(
                img_text.encode('utf-8').replace('\n', ' ').replace('\r', ''))
            # NOTE: short-tweet filtering deliberately disabled here.
            # if len(text) < 5: continue
            # if len(text.split(' ')) < 3: continue

            # BUG FIX: file.strip('.json') strips any leading/trailing
            # characters from the set {., j, s, o, n} — it would mangle a
            # stem ending in e.g. "son". splitext drops exactly the
            # extension.
            out_file.write(os.path.splitext(file)[0] + ',' + text + '\n')

        except Exception:
            # Skip files whose text fails to encode; `except Exception`
            # (not bare `except:`) so Ctrl-C still propagates.
            print("Error with file: " + file)
            continue

print("DONE")