Example No. 1
def preprocess_text(text, entities, fw):
    # Clean the tweet text and write it, together with its entity counts, as
    # quoted, delimiter-separated CSV fields.
    text = preprocess_text_characters(text)

    hashtagsCount = 0
    symbolsCount = 0
    user_mentionsCount = 0
    urlsCount = 0
    mediaCount = 0
    hashtags = entities.get('hashtags')
    if (hashtags is not None):
        hashtagsCount = len(hashtags)
    symbols = entities.get('symbols')
    if (symbols is not None):
        symbolsCount = len(symbols)
    user_mentions = entities.get('user_mentions')
    if (user_mentions is not None):
        user_mentionsCount = len(user_mentions)
    urls = entities.get('urls')
    if (urls is not None):
        urlsCount = len(urls)
    media = entities.get('media')
    if (media is not None):
        mediaCount = len(media)
    fw.write('"' + text + '"' + delimiter + '"' + xstr(hashtagsCount) + '"' +
             delimiter + '"' + xstr(symbolsCount) + '"' + delimiter + '"' +
             xstr(user_mentionsCount) + '"' + delimiter + '"' +
             xstr(urlsCount) + '"' + delimiter + '"' + xstr(mediaCount) + '"' +
             delimiter)
    return
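The five get()-then-check pairs above all compute "length of a possibly missing list". A more compact equivalent, sketched here against the same entities dict, falls back to an empty list instead:

# Equivalent, more compact counting: `x or []` turns a missing (None) entity
# list into an empty one, so len() can be applied unconditionally.
hashtagsCount = len(entities.get('hashtags') or [])
symbolsCount = len(entities.get('symbols') or [])
user_mentionsCount = len(entities.get('user_mentions') or [])
urlsCount = len(entities.get('urls') or [])
mediaCount = len(entities.get('media') or [])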
Example No. 2
def preprocess_extended_tweet(s, fw):
    # Same as preprocess_text(), but for the 'extended_tweet' payload of a
    # truncated tweet, whose full text lives under 'full_text'.
    text = preprocess_text_characters(s.get('full_text'))

    entities = s.get('entities')
    hashtagsCount = 0
    symbolsCount = 0
    user_mentionsCount = 0
    urlsCount = 0
    mediaCount = 0
    hashtags = entities.get('hashtags')
    if (hashtags is not None):
        hashtagsCount = len(hashtags)
    urls = entities.get('urls')
    if (urls is not None):
        urlsCount = len(urls)
    user_mentions = entities.get('user_mentions')
    if (user_mentions is not None):
        user_mentionsCount = len(user_mentions)
    symbols = entities.get('symbols')
    if (symbols is not None):
        symbolsCount = len(symbols)
    media = entities.get('media')
    if (media is not None):
        mediaCount = len(media)
    fw.write('"'
             #+xstr(text.translate(non_bmp_map))+
             + text + '"' + delimiter + '"' + xstr(hashtagsCount) + '"' +
             delimiter + '"' + xstr(urlsCount) + '"' + delimiter + '"' +
             xstr(user_mentionsCount) + '"' + delimiter + '"' +
             xstr(symbolsCount) + '"' + delimiter + '"' + xstr(mediaCount) +
             '"' + delimiter)
    return
Example No. 3
def preprocess_text_characters(text):
    # None-safe stringify, strip non-BMP characters, and replace CSV-hostile
    # characters (comma, newline, double quote) with sentinel tokens that
    # filterFile() later reverses.
    text = xstr(text)
    text = text.translate(non_bmp_map)
    text = text.replace(',', 'SNE_COMMA')
    text = text.replace('\n', 'SNE_LF')
    text = text.replace('\"', 'SNE_QUOTE')
    return text
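All of these examples call an xstr() helper that is never shown. Judging from how it is used (None must become an empty string, numbers and booleans must become text), a minimal sketch would be:

def xstr(s):
    # Assumed helper: None-safe str(); None -> '', everything else -> str(s).
    return '' if s is None else str(s)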
Example No. 4
import json


def preprocess_file(f_input, f_output, onlyOnce):
    count = 0
    file_tweet_count = 0
    start_index = 3435
    max_count = 3439
    global total_tweet_count

    print('--------')
    print('input=' + f_input, ' output=' + f_output)

    f_write = open(f_output, 'a', encoding='utf-8')
    if (onlyOnce):
        preprocess_tweet_header(f_write)
        f_write.write('\n')

    with open(f_input, 'r', newline='\r\n') as f:
        for line in f:
            #if (count < max_count):
            #    if (count >= start_index):
            tweet = json.loads(line)  # load it as Python dict
            # to avoid running into 'limit' responses
            if 'text' in tweet:
                preprocess_tweet(tweet, f_write)
                f_write.write('\n')
                file_tweet_count = file_tweet_count + 1
            count = count + 1
    f_write.close()

    total_tweet_count = total_tweet_count + file_tweet_count
    print('file tweet count=' + xstr(file_tweet_count))
    return
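A minimal call, assuming the input file holds one JSON-encoded tweet per line (the file names below are purely illustrative):

# Hypothetical file names, for illustration only.
preprocess_file('../data/sexual-tweets.json',
                'sexual-tweets-preprocessed.csv',
                True)  # True: also write the CSV header line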
Example No. 5
def preprocess_text_header(fw):
    fw.write('"' + xstr('text_text') + '"' + delimiter + '"' +
             xstr('text_hashtags_count') + '"' + delimiter + '"' +
             xstr('text_symbols_count') + '"' + delimiter + '"' +
             xstr('text_user_mentions_count') + '"' + delimiter + '"' +
             xstr('text_urls_count') + '"' + delimiter + '"' +
             xstr('text_media_count') + '"' + delimiter)
    return
Example No. 6
def preprocess_user(s, fw):
    name = preprocess_text_characters(s.get('name'))
    location = preprocess_text_characters(s.get('location'))
    description = preprocess_text_characters(s.get('description'))

    fw.write('"' + xstr(s.get('id_str')) + '"' + delimiter + '"' + name + '"' +
             delimiter + '"' + xstr(s.get('screen_name')) + '"' + delimiter +
             '"' + location + '"' + delimiter + '"' +
             xstr(s.get('profile_location')) + '"' + delimiter + '"' +
             description + '"' + delimiter + '"' + xstr(s.get('url')) + '"' +
             delimiter + '"' + xstr(s.get('entities')) + '"' + delimiter +
             '"' + xstr(s.get('protected')) + '"' + delimiter + '"' +
             xstr(s.get('followers_count')) + '"' + delimiter + '"' +
             xstr(s.get('friends_count')) + '"' + delimiter + '"' +
             xstr(s.get('listed_count')) + '"' + delimiter + '"' +
             xstr(s.get('created_at')) + '"' + delimiter + '"' +
             # note: the Twitter user object spells this field
             # 'favourites_count', so this lookup likely always comes back
             # empty
             xstr(s.get('favorites_count')) + '"' + delimiter + '"' +
             xstr(s.get('utc_offset')) + '"' + delimiter + '"' +
             xstr(s.get('time_zone')) + '"' + delimiter + '"' +
             xstr(s.get('geo_enabled')) + '"' + delimiter + '"' +
             xstr(s.get('verified')) + '"' + delimiter + '"' +
             xstr(s.get('statuses_count')) + '"' + delimiter + '"' +
             xstr(s.get('lang')) + '"' + delimiter + '"' +
             # note: the API field is spelled 'contributors_enabled'
             xstr(s.get('contributers_enabled')) + '"' + delimiter + '"' +
             xstr(s.get('profile_image_url')) + '"' + delimiter + '"' +
             xstr(s.get('default_profile')) + '"' + delimiter + '"' +
             xstr(s.get('follow_request_sent')) + '"' + delimiter)
    return
Example No. 7
def preprocess_tweet(s, fw):
    truncated = xstr(s.get('truncated'))
    retweeted_status = xstr(s.get('retweeted_status'))
    quoted_status = xstr(s.get('quoted_status'))

    # The 'source' field is an HTML anchor such as
    # '<a href="...">Twitter Web Client</a>'; keep only its display text.
    source = xstr(s.get('source'))
    if (source != ""):
        start = source.index(">", 0)
        end = source.index("<", start + 1)
        source = source[start + 1:end]
        source = preprocess_text_characters(source)

    place_name = ""
    place_full_name = ""
    place_country_code = ""
    place_country = ""
    place = s.get('place')
    if (place is not None):
        place_name = xstr(place.get('name'))
        place_full_name = xstr(place.get('full_name'))
        place_country_code = xstr(place.get('country_code'))
        place_country = xstr(place.get('country'))

    fw.write('"' + xstr(s.get('created_at')) + '"' + delimiter + '"' +
             xstr(s.get('id_str')) + '"' + delimiter + '"' + source + '"' +
             delimiter + '"' + xstr(s.get('in_reply_to_status_id')) + '"' +
             delimiter + '"' + xstr(s.get('in_reply_to_status_id_str')) + '"' +
             delimiter + '"' + xstr(s.get('in_reply_to_user_id')) + '"' +
             delimiter + '"' + xstr(s.get('in_reply_to_user_id_str')) + '"' +
             delimiter + '"' + xstr(s.get('in_reply_to_screen_name')) + '"' +
             delimiter + '"' + place_name + '"' + delimiter + '"' +
             place_full_name + '"' + delimiter + '"' + place_country_code +
             '"' + delimiter + '"' + place_country + '"' + delimiter + '"' +
             xstr(s.get('is_quote_status')) + '"' + delimiter + '"' +
             xstr(s.get('retweet_count')) + '"' + delimiter + '"' +
             xstr(s.get('favorite_count')) + '"' + delimiter + '"' +
             xstr(s.get('favorited')) + '"' + delimiter + '"' +
             xstr(s.get('retweeted')) + '"' + delimiter + '"' +
             xstr(s.get('possibly_sensitive')) + '"' + delimiter + '"' +
             xstr(s.get('lang')) + '"' + delimiter + '"' +
             xstr(s.get('truncated')) + '"' + delimiter + '"' +
             xstr(s.get('quoted_status_id_str')) + '"' + delimiter)

    #preprocess_coordinates(s['coordinates'], fw)

    preprocess_user(s['user'], fw)

    # 'truncated' was stringified with xstr(), hence the string comparison
    if (truncated == 'False'):
        preprocess_text(s['text'], s['entities'], fw)
    else:
        preprocess_extended_tweet(s['extended_tweet'], fw)

    #if (retweeted_status != ""):
    #    print("In Retweeted Status")
    #    preprocess_tweet(s['retweeted_status'], fw)

    #if (quoted_status != ""):
    #    print("In Quoted Status")
    #    preprocess_tweet(s['quoted_status'], fw)
    return
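For reference, the index/slice logic above reduces a typical source value to its display text; a quick check with an illustrative sample value:

# Illustrative sample of a Twitter 'source' field.
source = '<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>'
start = source.index(">", 0)
end = source.index("<", start + 1)
print(source[start + 1:end])  # -> Twitter Web Client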
Example No. 8
def preprocess_user_header(fw):
    fw.write('"' + xstr('user_id_str') + '"' + delimiter + '"' +
             xstr('user_name') + '"' + delimiter + '"' +
             xstr('user_screen_name') + '"' + delimiter + '"' +
             xstr('user_location') + '"' + delimiter + '"' +
             xstr('user_profile_location') + '"' + delimiter + '"' +
             xstr('user_description') + '"' + delimiter + '"' +
             xstr('user_url') + '"' + delimiter + '"' + xstr('user_entities') +
             '"' + delimiter + '"' + xstr('user_protected') + '"' + delimiter +
             '"' + xstr('user_followers_count') + '"' + delimiter + '"' +
             xstr('user_friends_count') + '"' + delimiter + '"' +
             xstr('user_listed_count') + '"' + delimiter + '"' +
             xstr('user_created_at') + '"' + delimiter + '"' +
             xstr('user_favorites_count') + '"' + delimiter + '"' +
             xstr('user_utc_offset') + '"' + delimiter + '"' +
             xstr('user_time_zone') + '"' + delimiter + '"' +
             xstr('user_geo_enabled') + '"' + delimiter + '"' +
             xstr('user_verified') + '"' + delimiter + '"' +
             xstr('user_statuses_count') + '"' + delimiter + '"' +
             xstr('user_lang') + '"' + delimiter + '"' +
             xstr('user_contributers_enabled') + '"' + delimiter + '"' +
             xstr('user_profile_image_url') + '"' + delimiter + '"' +
             xstr('user_default_profile') + '"' + delimiter + '"' +
             xstr('user_follow_request_sent') + '"' + delimiter)
    return
Example No. 9
def preprocess_tweet_header(fw):
    fw.write('"' + xstr('tweet_created_at') + '"' + delimiter + '"' +
             xstr('tweet_id_str') + '"' + delimiter + '"' +
             xstr('tweet_source') + '"' + delimiter + '"' +
             xstr('tweet_in_reply_to_status_id') + '"' + delimiter + '"' +
             xstr('tweet_in_reply_to_status_id_str') + '"' + delimiter + '"' +
             xstr('tweet_in_reply_to_user_id') + '"' + delimiter + '"' +
             xstr('tweet_in_reply_to_user_id_str') + '"' + delimiter + '"' +
             xstr('tweet_in_reply_to_screen_name') + '"' + delimiter + '"' +
             xstr('tweet_place_name') + '"' + delimiter + '"' +
             xstr('tweet_place_full_name') + '"' + delimiter + '"' +
             xstr('tweet_place_country_code') + '"' + delimiter + '"' +
             xstr('tweet_place_country') + '"' + delimiter + '"' +
             xstr('tweet_is_quote_status') + '"' + delimiter + '"' +
             xstr('tweet_retweet_count') + '"' + delimiter + '"' +
             xstr('tweet_favorite_count') + '"' + delimiter + '"' +
             xstr('tweet_favorited') + '"' + delimiter + '"' +
             xstr('tweet_retweeted') + '"' + delimiter + '"' +
             xstr('tweet_possibly_sensitive') + '"' + delimiter + '"' +
             xstr('tweet_lang') + '"' + delimiter + '"' +
             xstr('tweet_truncated') + '"' + delimiter + '"' +
             xstr('tweet_quoted_status_id_str') + '"' + delimiter)

    #preprocess_coordinates_header(fw)

    preprocess_user_header(fw)

    preprocess_text_header(fw)
    return
Example No. 10


import glob
import sys

# Map characters outside the Basic Multilingual Plane (e.g. most emoji) to
# the replacement character U+FFFD.
non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)
delimiter = ','
total_tweet_count = 0
onlyOnce = True
filenamePrefix = 'sexual'
filenamePrefix2 = 0
filenamePrefix2_str = ""

files = glob.glob("../data/" + filenamePrefix + "*.json")
for f in files:
    preprocess_file(
        f, filenamePrefix + filenamePrefix2_str + '-tweets-preprocessed.csv',
        onlyOnce)
    onlyOnce = False
    # Start a new output file (with a fresh header) roughly every
    # 750,000 tweets.
    if (int(total_tweet_count / 750000) > filenamePrefix2):
        filenamePrefix2 = int(total_tweet_count / 750000) + 1
        filenamePrefix2_str = xstr(filenamePrefix2)
        onlyOnce = True

print('--------')
print('total tweet count=' + xstr(total_tweet_count))
Example No. 11
import csv


def filterFile(type, f_input, f_output):
    fout = open(f_output, 'w', newline='', encoding='utf-8')
    # 'rU' mode is deprecated (removed in Python 3.11); for the csv module,
    # open with newline='' instead.
    with open(f_input, 'r', newline='', encoding='utf-8') as fin:
        reader = csv.DictReader(fin)

        setMaxCounts(type)

        intersectionCount = 0
        verifiedUserCount = 0
        tweetPlaceCountryCodeCount = 0
        otherCount = 0
        rowCount = 0

        onlyOnce = True

        for row in reader:
            if (onlyOnce):
                # Derive the output header from the first row's field names.
                writer = csv.DictWriter(fout, fieldnames=row.keys())
                writer.writeheader()
                onlyOnce = False

            text = row['text_text']
            #print('text='+text)
            if (text.startswith('RT')):
                continue

            tweet_lang = row['tweet_lang']
            #print('tweet_lang='+tweet_lang)
            if (not tweet_lang.startswith('en')):
                continue

            text_hashtags_count = row['text_hashtags_count']
            #print('text_hashtags_count='+text_hashtags_count)
            if (int(text_hashtags_count) > 3):
                continue

            text_symbols_count = row['text_symbols_count']
            #print('text_symbols_count='+text_symbols_count)
            if (int(text_symbols_count) > 3):
                continue

            text_user_mentions_count = row['text_user_mentions_count']
            #print('text_user_mentions_count='+text_user_mentions_count)
            if (int(text_user_mentions_count) > 3):
                continue

            text_urls_count = row['text_urls_count']
            #print('text_urls_count='+text_urls_count)
            if (int(text_urls_count) > 3):
                continue

            text_media_count = row['text_media_count']
            #print('text_media_count='+text_media_count)
            if (int(text_media_count) > 3):
                continue

            user_followers_count = row['user_followers_count']
            #print('user_followers_count='+user_followers_count)
            if (int(user_followers_count) < 10):
                continue

            user_friends_count = row['user_friends_count']
            #print('user_friends_count='+user_friends_count)
            if (int(user_friends_count) < 10):
                continue

            user_listed_count = row['user_listed_count']
            #print('user_listed_count='+user_listed_count)
            if (int(user_listed_count) < 5):
                continue

            text = text.replace('SNE_COMMA', ',')
            text = text.replace('SNE_QUOTE', '\"')
            text = text.replace('SNE_LF', '\n')

            # Create a list with all the terms
            terms_all = [term for term in tokenize_text(text, True)]
            #print("terms_all="+xstr(terms_all))

            # Count terms only (no hashtags, no mentions)
            terms_without_hashtags_mentions = [
                term for term in terms_all
                if term not in stop and not term.startswith(('#', '@'))
            ]
            #print("terms_without_hashtags_mentions="+xstr(terms_without_hashtags_mentions))

            # terms only (no URLs)
            terms_without_urls = [
                remove_urls(term) for term in terms_without_hashtags_mentions
            ]
            #print("terms_without_urls="+xstr(terms_without_urls))

            # non empty and more than one character long terms only
            terms_only = [
                term for term in terms_without_urls
                if term != "" and len(term) > 1
            ]
            #print("terms_only="+xstr(terms_only))

            if (len(terms_only) < 5):
                continue

            user_verified = row['user_verified']
            #print('user_verified='+user_verified)

            tweet_place_country_code = row['tweet_place_country_code']
            #print('tweet_place_country_code='+tweet_place_country_code)

            #both need to be valid
            # note: xstr() writes Python booleans as 'True'/'False'; the
            # all-caps 'TRUE' (and 'FALSE' below) matches only if the CSV was
            # re-saved by a tool such as Excel that uppercases booleans
            if (user_verified == 'TRUE' and tweet_place_country_code != ""
                    and intersectionCount < intersectionMaxCount):
                intersectionCount = intersectionCount + 1
                rowCount = rowCount + 1
                #write intersection records
                writer.writerow(row)
                continue

            #only verified users
            if (user_verified == 'TRUE' and tweet_place_country_code == ""
                    and verifiedUserCount < verifiedUserMaxCount):
                verifiedUserCount = verifiedUserCount + 1
                rowCount = rowCount + 1
                #write verified user only  records
                writer.writerow(row)
                continue

            #only non-empty tweet_place country code
            if (user_verified == 'FALSE' and tweet_place_country_code != ""
                    and tweetPlaceCountryCodeCount <
                    tweetPlaceCountryCodeMaxCount):
                tweetPlaceCountryCodeCount = tweetPlaceCountryCodeCount + 1
                rowCount = rowCount + 1
                #write non-empty tweet place country code only  records
                writer.writerow(row)
                continue

            if (otherCount < totalMaxCount - intersectionMaxCount -
                    verifiedUserMaxCount - tweetPlaceCountryCodeMaxCount):
                otherCount = otherCount + 1
                rowCount = rowCount + 1
                #write remaining records
                writer.writerow(row)
                continue

            if (rowCount >= totalMaxCount):
                break

        print('-----------')
        print('intersectionMaxCount=' + xstr(intersectionMaxCount))
        print('verifiedUserMaxCount=' + xstr(verifiedUserMaxCount))
        print('tweetPlaceCountryCodeMaxCount=' +
              xstr(tweetPlaceCountryCodeMaxCount))
        print('totalMaxCount=' + xstr(totalMaxCount))

        print('intersectionCount=' + xstr(intersectionCount))
        print('verifiedUserCount=' + xstr(verifiedUserCount))
        print('tweetPlaceCountryCodeCount=' + xstr(tweetPlaceCountryCodeCount))
        print('otherCount=' + xstr(otherCount))
        print('rowCount=' + xstr(rowCount))
        fout.close()
    return
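filterFile() also leans on helpers that are not part of these examples: setMaxCounts() (which sets the *MaxCount globals printed above), tokenize_text(), remove_urls(), and a stop collection of stopwords. Minimal sketches, written as assumptions about their behavior rather than the original implementations:

# Assumed helpers, sketched from how filterFile() uses them.
import re

from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer

stop = set(stopwords.words('english'))
_tweet_tokenizer = TweetTokenizer(preserve_case=False)


def tokenize_text(text, lowercase):
    # filterFile() passes a lowercase flag; here it is assumed to be covered
    # by the tokenizer's preserve_case=False.
    return _tweet_tokenizer.tokenize(text)


def remove_urls(term):
    # Blank out anything that looks like a URL.
    return re.sub(r'https?://\S+', '', term)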


import nltk
from nltk.classify import NaiveBayesClassifier
from nltk.collocations import BigramCollocationFinder
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import extract_unigram_feats, extract_bigram_feats


def perform_ml(total_terms, training_data, testing_data, type):
    #print("total_terms="+xstr(total_terms))
    #print("traning_tweets="+xstr(training_data))
    #print("testing_tweets="+xstr(testing_data))

    sentim_analyzer = SentimentAnalyzer()
    all_words = sentim_analyzer.all_words([terms for terms in total_terms])

    # use unigram feats from class specific unigram lists
    unigram_feats = []
    if (type == "nlp_terms"):
        unigram_feats = sentim_analyzer.unigram_word_feats(all_words,
                                                           min_freq=4)
    else:
        unigram_feats = harmful_search_unigrams + other_search_unigrams + physical_search_unigrams + sexual_search_unigrams
    #print("unigram_feats="+xstr(unigram_feats))
    print(str(len(unigram_feats)))

    # use bigram feats from class specific bigram lists
    bigram_feats = []
    if (type == "nlp_terms"):
        bigram_measures = nltk.collocations.BigramAssocMeasures()
        bi_finder = BigramCollocationFinder.from_words(all_words)
        bi_finder.apply_freq_filter(3)
        # note: nbest(score_fn, -1) slices the ranked list with [:-1] and so
        # drops the last bigram; pass an explicit n to keep them all
        bigram_feats = bi_finder.nbest(bigram_measures.pmi, -1)
        #bigram_feats = bi_finder.nbest(bigram_measures.pmi, 100)
        #bigram_feats = bi_finder.nbest(bigram_measures.chi_sq, -1)
        #bigram_feats = bi_finder.nbest(bigram_measures.likelihood_ratio, 100)
    else:
        bigram_feats = harmful_search_bigrams + other_search_bigrams + physical_search_bigrams + sexual_search_bigrams
    #print("bigram_feats="+xstr(bigram_feats))
    print(str(len(bigram_feats)))

    sentim_analyzer.add_feat_extractor(extract_unigram_feats,
                                       unigrams=unigram_feats)
    sentim_analyzer.add_feat_extractor(extract_bigram_feats,
                                       bigrams=bigram_feats)

    training_set = sentim_analyzer.apply_features(training_data)
    test_set = sentim_analyzer.apply_features(testing_data)

    #print("training_set="+xstr(training_set))
    print(str(len(training_set)))
    #print("test_set="+xstr(test_set))
    print(str(len(test_set)))

    test_data_only = []
    test_labels_only = []
    for test_data_row in test_set:
        test_data_only.append(test_data_row[0])
        test_labels_only.append(test_data_row[1])

    trainer = NaiveBayesClassifier.train
    classifier = sentim_analyzer.train(trainer, training_set)
    for key, value in sorted(sentim_analyzer.evaluate(test_set).items()):
        print('{0}: {1}'.format(key, value))

    nltk_pred_labels = classifier.classify_many(test_data_only)

    cm = nltk.ConfusionMatrix(test_labels_only, nltk_pred_labels)
    print(cm.pretty_format(sort_by_count=True, show_percents=False,
                           truncate=9))

    # show_most_informative_features() prints its table to stdout and
    # returns None, so call it for its side effect
    print("Most Informative Features:")
    classifier.show_most_informative_features(25)

    return
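perform_ml() follows NLTK's SentimentAnalyzer workflow, so training_data and testing_data are lists of (token_list, label) pairs. A toy invocation with made-up tokens and labels, purely to show the expected shapes:

# Toy data, for illustration only: (tokens, label) pairs as expected by
# SentimentAnalyzer.apply_features().
training_data = [(['you', 'are', 'awful'], 'harmful'),
                 (['have', 'a', 'nice', 'day'], 'other')]
testing_data = [(['awful', 'day'], 'harmful')]
total_terms = [tokens for tokens, label in training_data]

perform_ml(total_terms, training_data, testing_data, "nlp_terms")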