# Example 1
def preprocess(tweet):
    """Normalize a raw tweet string.

    Tokenizes the tweet, rejoins the tokens with single spaces, then runs
    the result through a hashtag transformer and a substitution transformer
    that erases known abbreviations, emoticons, and emotion-lexicon words
    (each is mapped to a single space).

    Defect fixed: the original called ``json.load(open(...))``, leaking the
    file handles; both files are now opened with ``with`` so they are closed
    deterministically.

    :param tweet: raw tweet text
    :return: preprocessed tweet text
    """
    with open("../other/abbreviations.json") as abbv_file:
        abbv_dict = json.load(abbv_file)
    with open("../other/emoticons.json") as emo_file:
        emo_lexica_dict = json.load(emo_file)
    # Map every emoticon and emotion-lexicon word to a space so the
    # substitution pass simply removes them from the text.
    for emoticon in emo_lexica_dict[u'emoticons']:
        abbv_dict[emoticon] = ' '
    for word in emo_lexica_dict[u'words']:
        abbv_dict[word] = ' '
    hash_transformer = Transformer.HashtagTransformer()
    sub_transformer = Transformer.SubstitutionTransformer(abbv_dict)
    preprocessor = Preprocessor([hash_transformer, sub_transformer])
    # Collapse whitespace by tokenizing and rejoining before transforming.
    tweet = ' '.join(tokenize(tweet))
    tweet = preprocessor.transform(tweet)
    return tweet
# Example 2
        # NOTE(review): fragment of a larger method — its `def` line (and the
        # initialization of emotiontweets/nonemotiontweets/non_emo_tweets) is
        # outside this view; presumably the method partitions tweets into
        # emotion/non-emotion groups and transforms each — confirm upstream.
        # Transform every non-emotion tweet and collect the results.
        for tweet in non_emo_tweets:
            nonemotiontweets.append(self.transform(tweet))
        return emotiontweets, nonemotiontweets


if __name__ == '__main__':
    # Emotion category labels; their use is not visible in this excerpt —
    # presumably consumed further down in the script. TODO confirm.
    emos = ['happy', 'angry', 'disgust', 'sad', 'surprise']
    # NOTE(review): json.load(open(...)) never closes the file handles;
    # prefer `with open(...)` context managers.
    abbv_dict = json.load(open("../other/abbreviations.json"))
    emo_lexica_dict = json.load(open("../other/emoticons.json"))
    # Map each emoticon to a space so substitution removes it from tweets.
    # Add emoticons from emotions.json
    for emoticon in emo_lexica_dict[u'emoticons']:
        abbv_dict[emoticon] = ' '
    # Map each emotion-lexicon word to a space as well.
    # Add words from emotions.json
    for word in emo_lexica_dict[u'words']:
        abbv_dict[word] = ' '
    # Build the preprocessing pipeline: hashtag handling first, then
    # abbreviation/emoticon substitution using the merged dictionary.
    hash_transformer = Transformer.HashtagTransformer()
    sub_transformer = Transformer.SubstitutionTransformer(abbv_dict)
    preprocessor = Preprocessor([hash_transformer, sub_transformer])

    # Command-line options: source and destination directories.
    parser = ArgumentParser()
    parser.add_argument("--inputdir",
                        dest="source_dir",
                        help="input directory",
                        default="../data/input")
    parser.add_argument("--outputdir",
                        dest="output_dir",
                        help="output directory",
                        default="../data/preprocessed")
    args = parser.parse_args()

    # Select only .json files from the input directory; the processing of
    # `files` continues beyond this excerpt.
    files = [f for f in os.listdir(args.source_dir) if f.endswith('.json')]