def execute_all_term_functions(self, index, number_word_frequency_results=10):
    """Scan every document in *index* and compute term statistics.

    Args:
        index: Elasticsearch index name to iterate over.
        number_word_frequency_results: how many of the most frequent
            terms to include in the result (default 10).

    Returns:
        dict with:
            "word_frequency": (term, count) tuples for the most common
                non-stop-word terms, and
            "max_sentence_size": token length of the longest tweet seen.
    """
    current_max_sentence_size = 0
    count_word_frequency = Counter()
    for entry in es.iterate_search(index_name=index):
        # Tokenize once and reuse for both statistics (the original
        # preprocessed the same text twice per entry).
        tokens = preprocessor.preprocess(entry['_source']['text'])

        # Step 1: track the longest tweet (in tokens) seen so far.
        current_max_sentence_size = max(current_max_sentence_size, len(tokens))

        # Step 2: count term frequencies, ignoring stop words.
        count_word_frequency.update(term for term in tokens if term not in stop)

    # Return a literal rather than shadowing the builtin name `dict`.
    return {
        "word_frequency":
            count_word_frequency.most_common(number_word_frequency_results),
        "max_sentence_size": current_max_sentence_size,
    }


    def max_tweet_sentence_size(self, filename):
        """Placeholder: longest-tweet size is not computable yet.

        TODO: needs Elasticsearch-backed support before implementing.
        """
        return -1

    def count_word_frequency(self, filename):
        """Placeholder word-frequency counter; always returns -1."""
        return -1

    def most_common_words(self, num_results, filename):
        """Placeholder: most-common-words lookup is not implemented.

        TODO: needs Elasticsearch-backed support before implementing.
        """
        return -1
# Exemplo n.º 2  (scraper pagination artifact — commented out to keep the file parseable)
# 0
def test():
    """Build a cleaned corpus from the configured Twitter index and run LDA.

    Pulls every document from the Elasticsearch index named by the
    configured topic, strips stop words, URLs, @-mentions and hashtags
    from each tweet, then feeds the cleaned texts to the LDA runner.

    Returns:
        Whatever testlda.run produces for the corpus (the original
        assigned this to a dead local and discarded it).
    """
    texts = []
    res = elastic_utils.iterate_search(
        index_name=cfg.twitter_credentials['topic'])
    for hit in res:
        cleaned = preprocessor.preprocess(hit['_source']['text'])
        cleaned = preprocessor.remove_stop_words(cleaned)
        cleaned = preprocessor.remove_urls(cleaned)
        cleaned = preprocessor.remove_ats(cleaned)  # strip @username mentions
        # TODO: hashtags may carry topical signal worth keeping.
        cleaned = preprocessor.remove_hashtags(cleaned)
        texts.append(cleaned)
    return testlda.run(texts)
# Exemplo n.º 3  (scraper pagination artifact — commented out to keep the file parseable)
# 0
def count_words(number_word_frequency_results, list_in_question):
    """Count term frequencies across a list of raw texts.

    Args:
        number_word_frequency_results: number of top terms to return.
        list_in_question: iterable of raw text strings to tokenize.

    Returns:
        List of (term, count) tuples for the most common terms,
        excluding punctuation and common English/Twitter stop words.
    """
    # Ensure the NLTK stop-word corpus is present (no-op once cached).
    nltk.download('stopwords')

    punctuation = list(string.punctuation)
    stop = stopwords.words('english') + punctuation + [
        'rt', 'via', '…', 'I', '’', 'The', '!'
    ]
    count_word_frequency = Counter()
    for entry in list_in_question:
        # (Leftover per-entry debug prints removed.)
        count_word_frequency.update(
            term for term in preprocessor.preprocess(entry)
            if term not in stop)
    return count_word_frequency.most_common(number_word_frequency_results)
# Exemplo n.º 4  (scraper pagination artifact — commented out to keep the file parseable)
# 0
def twitteruser_suggest(
        request, template_name='fyp/twitteruser/twitteruser_suggest.html'):
    """Django view: suggest topics from selected Twitter users' tweets.

    POST with 'twitteruser-form': collect the chosen users' tweets,
    clean them, and render the 25 most frequent terms as suggestions.
    POST with 'suggestcat-form': persist each chosen category, create a
    matching Elasticsearch index, and queue historic tweet collection.
    Any other request (or after category creation) renders the user's
    existing TwitterUser list.
    """
    if request.method == 'POST':
        if 'twitteruser-form' in request.POST:
            all_tweets = []
            for user in request.POST.getlist('suggest-user'):
                all_tweets.extend(
                    collect_user_tweets.get_all_users_tweets(user))
            count_word_frequency = Counter()
            for tweet in all_tweets:
                text = preprocessor.preprocess(str(tweet.text))
                text = preprocessor.remove_stop_words(text)
                text = preprocessor.remove_ats(text)
                text = preprocessor.remove_hashtags(text)
                text = preprocessor.remove_urls(text)
                # Drop very short tokens (<= 2 chars) as noise.
                text = [token for token in text if len(token) > 2]
                # Update the counter directly — the original first built
                # an identical throwaway copy (terms_all) and kept a dead
                # all_text accumulator that was never read.
                count_word_frequency.update(text)
            suggestions = count_word_frequency.most_common(25)
            return render(request, template_name, {
                'suggestions': suggestions,
                'object_list': TwitterUser.objects.filter(user=request.user),
            })
        if 'suggestcat-form' in request.POST:
            for category in request.POST.getlist('suggest-category'):
                # Strip tuple-literal characters left over from the form
                # rendering, e.g. "('news',)" -> "news".
                category = ''.join(c for c in category if c not in '()\',')
                TwitterCat(user=request.user, category_name=category).save()
                elastic_utils.create_index(category)
                # Asynchronously backfill the last 30 days of tweets.
                collect_old_tweets.delay(category, 30)

    return render(request, template_name, {
        'object_list': TwitterUser.objects.filter(user=request.user),
    })
def word_cloud(id, topic):
    """The word cloud task creates a word cloud from the data.

    For each of the user's TwitterCat categories, counts how many
    documents in the Elasticsearch index *topic* match the (stemmed)
    category name as a prefix query, reporting progress through the
    current Celery task after every category.

    Args:
        id: user primary key whose categories are scanned.
        topic: name of the Elasticsearch index to search.

    Returns:
        Tuple (category, jsonData): the processed category names, and a
        JSON string mapping each name to its hit count.
    """
    item = {}
    category = []
    cat = TwitterCat.objects.filter(user_id=id)
    for entry in cat:
        # Normalize the category name: tokenize, stem, then strip the
        # list-literal characters produced by joining the token list.
        entry = preprocessor.preprocess(entry.category_name)
        entry = preprocessor.porter_stemming(entry)
        entry = ''.join(c for c in entry if c not in '[]\'')
        # NOTE(review): the category text is interpolated straight into
        # the query_string JSON — special characters could break or
        # alter the query; confirm inputs are sanitized upstream.
        res = (elastic_utils.search_index(
            topic,
            query='{"query":{"query_string":{"fields":["text"],"query":"%s*"}}}'
            % str(entry)))
        # assumes an ES response where hits.total is a plain count
        # (pre-7.x shape) — TODO confirm against the cluster version.
        total = res['hits']['total']
        item[entry] = total
        category.append(entry)
        # Surface incremental progress to the Celery task consumer.
        current_task.update_state(state='PROGRESS',
                                  meta={
                                      'current_categories': category,
                                      'current_results': item
                                  })
    jsonData = json.dumps(item)
    return (category, jsonData)