def preprocess(tweet):
    """Tokenize *tweet* and run it through the hashtag/substitution pipeline.

    The substitution table maps known abbreviations to their expansions and
    maps every known emoticon and emotion word to a single space, which
    effectively strips them from the text.

    :param tweet: raw tweet text (str)
    :return: the preprocessed tweet text (str)
    """
    # NOTE(review): both JSON lookup files are re-read and the transformer
    # pipeline rebuilt on every call — consider hoisting to module level if
    # this runs once per tweet. Files are now closed deterministically.
    with open("../other/abbreviations.json") as abbv_file:
        abbv_dict = json.load(abbv_file)
    with open("../other/emoticons.json") as emo_file:
        emo_lexica_dict = json.load(emo_file)
    # Map emoticons and emotion words to a space so substitution removes them.
    for emoticon in emo_lexica_dict[u'emoticons']:
        abbv_dict[emoticon] = ' '
    for word in emo_lexica_dict[u'words']:
        abbv_dict[word] = ' '
    hash_transformer = Transformer.HashtagTransformer()
    sub_transformer = Transformer.SubstitutionTransformer(abbv_dict)
    preprocessor = Preprocessor([hash_transformer, sub_transformer])
    # Re-join tokens with single spaces before applying the transformers.
    tweet = ' '.join(tokenize(tweet))
    tweet = preprocessor.transform(tweet)
    return tweet
for tweet in non_emo_tweets: nonemotiontweets.append(self.transform(tweet)) return emotiontweets, nonemotiontweets if __name__ == '__main__': emos = ['happy', 'angry', 'disgust', 'sad', 'surprise'] abbv_dict = json.load(open("../other/abbreviations.json")) emo_lexica_dict = json.load(open("../other/emoticons.json")) # Add emoticons from emotions.json for emoticon in emo_lexica_dict[u'emoticons']: abbv_dict[emoticon] = ' ' # Add words from emotions.json for word in emo_lexica_dict[u'words']: abbv_dict[word] = ' ' hash_transformer = Transformer.HashtagTransformer() sub_transformer = Transformer.SubstitutionTransformer(abbv_dict) preprocessor = Preprocessor([hash_transformer, sub_transformer]) parser = ArgumentParser() parser.add_argument("--inputdir", dest="source_dir", help="input directory", default="../data/input") parser.add_argument("--outputdir", dest="output_dir", help="output directory", default="../data/preprocessed") args = parser.parse_args() files = [f for f in os.listdir(args.source_dir) if f.endswith('.json')]