def createBigramTrigram(infile, outfile, type):
    """Build an n-gram frequency ranking from a tweet file and write it out.

    Parameters:
        infile: path of the tweet file, read via the Loader module.
        outfile: destination path, passed through to write_file.
        type: 'b' to generate bigrams; any other value generates trigrams.
            (Name shadows the builtin but is kept for caller compatibility.)
    """
    # Read tweets with the loader module.
    loader = Loader()
    items = loader.read_file(infile)

    # Generate one list of n-gram strings per tweet.
    if type == 'b':
        grams_per_tweet = [bigramas(tweet['text']) for tweet in items]
    else:
        grams_per_tweet = [trigramas(tweet['text']) for tweet in items]

    # Count token occurrences, normalizing newlines/tabs as before.
    # Bug fix: the original guard `if len(word) < 0` was dead code (len is
    # never negative), so empty tokens produced by consecutive spaces were
    # counted; they are now skipped as clearly intended.
    count = collections.Counter()
    for grams in grams_per_tweet:
        for gram in grams:
            for word in gram.replace('\n', ' ').replace('\t', '').split(' '):
                if word:
                    count[word] += 1

    # Rank by descending frequency (stable for equal counts, as before).
    l = sorted(count.items(), key=lambda x: -x[1])

    write_file(infile, outfile, l)
# ---- Example 2 ----
def createDict(infile):
    """Count tweets per user and return (user_name, count) pairs.

    Parameters:
        infile: path of the tweet file, read via the Loader module.

    Returns:
        A list of (user_name, count) tuples sorted by count, descending.
    """
    # Read tweets with the loader module.
    loader = Loader()
    items = loader.read_file(infile)

    # Bug fix: the original declared defaultdict(list) but stored int
    # counters (the explicit `in` check masked the wrong factory).
    # defaultdict(int) matches the actual usage and removes the branch.
    dic = collections.defaultdict(int)
    for tweet in items:
        dic[tweet['user_name']] += 1

    # Most frequent users first.
    return sorted(dic.items(), key=lambda kv: kv[1], reverse=True)
# ---- Example 3 ----
def sanitize(infile, outfile, stopwords, emoji, rt):
    """Clean every tweet's text and write the sanitized tweets to outfile.

    Each text is normalized (quotes, apostrophe-s), then stripped of URLs,
    symbols and stopwords; emoji are removed unless `emoji` is truthy, and
    retweets are handled when `rt` is truthy.
    """
    # Set up the cleaner and resolve the stopword files into a word list.
    cleaner = TweetCleaner()
    stopword_list = cleaner.load_stopwords(stopwords)

    # Load the tweets to sanitize.
    loader = Loader()
    items = loader.read_file(infile)

    for tweet in items:
        # Run the cleaning pipeline on a local copy, then store it back.
        text = tweet['text']
        text = cleaner.standardize_quotes(text)
        text = cleaner.clean_apostrophe_s(text)
        text = cleaner.remove_urls(text)
        text = cleaner.remove_symbols(text)
        text = cleaner.remove_stopwords(text, stopword_list)
        tweet['text'] = text
        if not emoji:
            tweet['text'] = cleaner.remove_emoji(tweet['text'])
        if rt:
            # NOTE(review): remove_rts receives the full list while we are
            # iterating it — confirm it does not mutate `items` in place.
            cleaner.remove_rts(items, tweet)

    write_file(infile, outfile, items)
# ---- Example 4 ----
def _ranking_section(header, counts, displaycount):
    """Format the top `displaycount` entries of a Counter as a text section.

    Entries are ordered by count descending, ties broken by key descending —
    the same order the original sort produced (Counter.most_common would
    break ties by insertion order, so an explicit sort is used instead).
    """
    ranked = sorted(counts.items(), reverse=True, key=lambda kv: (kv[1], kv[0]))
    parts = [header]
    # max(0, ...) keeps a non-positive displaycount yielding an empty section,
    # matching the original `count < displaycount` guard.
    for key, value in ranked[:max(0, displaycount)]:
        parts.append('\t%s: %s\n' % (key, value))
    return ''.join(parts)


def report(infile, outfile, displaycount):
    """Write a text summary (counts, date range, top retweets, word/user/
    hashtag rankings) of a tweet file to `outfile`.

    Parameters:
        infile: path of the tweet file, read via the Loader module.
        outfile: path the summary text is written to (utf-8).
        displaycount: number of entries shown in each ranking section.

    Exits via sys.exit(0) when the tweets lack the required 'text' key.
    """
    # Initialize cleaner and load the bundled stopword lists.
    cleaner = TweetCleaner()
    stopwords = cleaner.load_stopwords(['stopwords/stopwords_en.txt', 'stopwords/stopwords_pt-br.txt'])

    # Read file with loader module.
    print('Reading file. This may take a while...')
    loader = Loader()
    items = loader.read_file(infile)
    print('File read successfully!\nProcessing the summary...')

    if 'text' not in items[0]:
        print("Warning: 'text' key is required.\nTerminating...")
        sys.exit(0)

    tweet_count = len(items)

    summary = "File name: " + infile + '\n'
    summary += "Tweet count: " + str(tweet_count) + "\n\n"

    # Date range: prefer 'created_at', fall back to 'date' (the two original
    # branches were identical except for the key, so they are merged).
    # Assumes tweets are ordered newest-first — TODO confirm against Loader.
    if 'created_at' in items[0]:
        date_key = 'created_at'
    elif 'date' in items[0]:
        date_key = 'date'
    else:
        date_key = None

    if date_key is not None:
        summary += "Most recent tweet: " + items[0][date_key] + "\n"
        summary += "Oldest tweet: " + items[tweet_count - 1][date_key] + "\n"
    else:
        summary += "Warning: 'created_at' or 'date' key does not exist. Date range information cannot be fetched."

    username_key = get_username_key(items[0])

    if 'retweets' in items[0]:
        summary += '\nTop retweeted tweets:\n'
        cont = 0
        for tweet in sorted(items, reverse=True, key=lambda i: i['retweets']):
            # Skip manual retweets; list only original tweets.
            if 'RT @' not in tweet['text']:
                summary += format_print_tweet(tweet, username_key)
                cont += 1
            # Bug fix: the break was hard-coded at 10, silently capping the
            # section when displaycount > 10; now honors displaycount.
            if cont >= displaycount:
                break

    # Clean and lowercase every text before tokenizing.
    for tweet in items:
        text = cleaner.standardize_quotes(tweet['text'])
        text = cleaner.clean_apostrophe_s(text)
        text = cleaner.remove_urls(text)
        text = cleaner.remove_symbols(text)
        text = cleaner.remove_stopwords(text, stopwords)
        text = cleaner.remove_emoji(text)
        tweet['text'] = text.lower()

    # Tally hashtags, mentions and words with Counter instead of the three
    # hand-rolled membership-check loops.
    hashtag_count = collections.Counter()
    user_count = collections.Counter()
    word_count = collections.Counter()
    for tweet in items:
        hashtag_count.update(re.findall(r'#\w+', tweet['text']))
        user_count.update(re.findall(r'@\w+', tweet['text']))
        word_count.update(re.findall(r'\b\w+', tweet['text']))

    # Ranking sections (headers reproduced byte-for-byte; the original also
    # kept iterating past displaycount without appending — now truncated).
    summary += _ranking_section('\n\nWord ranking:\n\n', word_count, displaycount)
    summary += _ranking_section('\nUser ranking:\n\n', user_count, displaycount)
    summary += _ranking_section('\nHashtag ranking:\n\n', hashtag_count, displaycount)

    with open(outfile, 'w', encoding='utf8') as f:
        f.write(summary)

    print('Succesfully wrote file to ' + outfile + '!')