from preprocess_tweets import tweet_preprocessing
import random
import os
import json

# Build an LSTM test file: one "<tweet_id>,<preprocessed text>" line per
# tweet JSON found under json_all/.
base_path = '../../../../datasets/HateSPic/twitter/json_all/'
out_path = '../../../../datasets/HateSPic/lstm_data/twitter/'

# Context manager guarantees the output file is flushed/closed even on error
# (the original left it open for the process lifetime).
with open(out_path + 'tweets.test', 'w') as out_file:
    for fname in os.listdir(base_path):  # renamed: 'file' shadows a builtin
        print(fname)
        try:
            # json.load moved inside the try (a malformed JSON previously
            # crashed the whole run) and the handle is now closed promptly.
            with open(base_path + fname) as f:
                info = json.load(f)
            text = tweet_preprocessing(info['text'].encode('utf-8'))
            # Discard short tweets
            if len(text) < 5:
                continue
            if len(text.split(' ')) < 3:
                continue
            text = text.strip('\r').strip('\n')
            out_file.write(str(info['id']) + ',' + text + '\n')
        except Exception:
            # Narrowed from a bare 'except:', which also swallowed
            # KeyboardInterrupt/SystemExit.
            print("Error with file: " + fname)
            continue

print("DONE")
# NOTE(review): whitespace-mangled FRAGMENT of a larger script, collapsed onto
# one line and truncated mid-statement (it ends with a bare "else:" whose body
# was cut off). It references names defined outside this view: i (presumably a
# dataset index selecting a CSV layout), line, hate_file, hate_file_val,
# nothate_file_val. Not runnable as-is; left byte-identical rather than
# reconstructed, since the missing else-branch cannot be recovered from here.
# Apparent intent (hedged — confirm against the full script): parse a CSV line
# whose label/text columns differ per dataset (i == 0: label col 5, text cols
# 6+; i in [1, 2]: label col 0, text cols 1+), preprocess the text, drop short
# tweets, then route label 0 into hate files and label 2 into not-hate files
# with a random train/validation split.
if i == 0: if len(line.split(',')) < 6: #print("Continuing: " + line) continue text = ' '.join(line.split(',')[6:]) label_id = int(line.split(',')[5]) if i in [1, 2]: if len(line.split(',')) < 2: #print("Continuing: " + line) continue text = ' '.join(line.split(',')[1:]) label_id = int(line.split(',')[0]) text = tweet_preprocessing(text) # Discard short tweets if len(text) < 5: continue if len(text.split(' ')) < 3: continue if label_id == 0: if random.randint(0, 10) == 0: hate_file_val.write(text) else: hate_file.write(text) if label_id == 2: if random.randint(0, 62) == 0: nothate_file_val.write(text) else:
# NOTE(review): whitespace-mangled FRAGMENT of a larger script, collapsed onto
# one line and truncated (it ends with "for l in test:" missing its loop
# body). It references names defined outside this view: data_path,
# out_file_train, out_file_val. Python 2 code (dict.iteritems). Not runnable
# as-is; left byte-identical rather than reconstructed, since the truncated
# tail cannot be recovered from here.
# Apparent intent (hedged — confirm against the full script): for each tweet
# record, derive a soft hate label as (count of labels > 0) / 3.0, preprocess
# the tweet text, then randomly split lines ~80/10/10 into train/val/test
# (selector in 1..10: >8 → val, ==8 → test, else train) before writing each
# list to its output file.
data = json.load(open(data_path, 'r')) train = [] val = [] test = [] print("Generating lstm data") for k, v in data.iteritems(): total_hate = 0 for label in v['labels']: if label > 0: total_hate += 1 label = total_hate / 3.0 text = tweet_preprocessing(v['tweet_text'].encode('utf-8')) split_selector = random.randint(1, 10) if split_selector > 8: val.append(str(k) + ',' + text + ',' + str(label) + '\n') elif split_selector > 7: test.append(str(k) + ',' + text + ',' + str(label) + '\n') else: train.append(str(k) + ',' + text + ',' + str(label) + '\n') for l in train: out_file_train.write(l) for l in val: out_file_val.write(l) for l in test:
from preprocess_tweets import tweet_preprocessing
import os
import json

# Build an image-text file: one "<tweet_id>,<preprocessed OCR text>" line per
# JSON found under img_txt/.
base_path = '../../../../datasets/HateSPic/MMHS/img_txt/'
out_path = '../../../../datasets/HateSPic/MMHS/lstm_data/'

# Context manager guarantees the output file is flushed/closed even on error
# (the original left it open for the process lifetime).
with open(out_path + 'tweets.img_txt', 'w') as out_file:
    for fname in os.listdir(base_path):  # renamed: 'file' shadows a builtin
        print(fname)
        try:
            # json.load moved inside the try (a malformed JSON previously
            # crashed the whole run) and the handle is now closed promptly.
            with open(base_path + fname) as f:
                img_text = json.load(f)['img_text']
            text = tweet_preprocessing(
                img_text.encode('utf-8').replace('\n', ' ').replace('\r', ''))
            # Discard short tweets
            # if len(text) < 5: continue
            # if len(text.split(' ')) < 3: continue
            # BUG FIX: str.strip('.json') strips any of the characters
            # '.', 'j', 's', 'o', 'n' from BOTH ends — not the '.json'
            # suffix — so it would mangle any id starting/ending with those
            # characters. splitext removes exactly the extension.
            tweet_id = os.path.splitext(fname)[0]
            out_file.write(tweet_id + ',' + text + '\n')
        except Exception:
            # Narrowed from a bare 'except:', which also swallowed
            # KeyboardInterrupt/SystemExit.
            print("Error with file: " + fname)
            continue

print("DONE")