def collect_todays_tweets(entry):
    """Roll up today's tweets for a topic into one daily summary document.

    Reads every tweet stored in the "<entry>-latest" index, builds an
    hour-by-hour count and a word-frequency table, writes a single summary
    document into the main "<entry>" index, then deletes and recreates the
    "-latest" index so it is empty for the next day.

    Args:
        entry: base topic index name. Derived names (containing "-latest"
            or "median") are ignored.
    """
    word_counter = Counter()
    hour_breakdown = {}
    # Only base topic indices get a daily summary; skip derived index names.
    if "-latest" in entry or "median" in entry:
        return
    # We first need to collect all of today's tweets.
    entry_total = elastic_utils.last_id(entry)
    if elastic_utils.check_index_exists(entry + "-latest") is not True:
        return
    total = elastic_utils.last_id(entry + "-latest")
    day_res = elastic_utils.iterate_search(
        entry + "-latest",
        query={
            "query": {
                "match_all": {}
            },
            "sort": [{
                "last_time": {
                    "order": "desc"
                }
            }]
        })
    dateobj = None
    created_at = None
    for hit in day_res:
        # 'created' is stored as "YYYY-MM-DD HH:MM:SS" — TODO confirm all
        # writers use this exact format.
        parsed = datetime.strptime(hit["_source"]["created"],
                                   '%Y-%m-%d %H:%M:%S')
        dateobj = parsed.strftime("%Y-%m-%d")
        created_at = parsed.strftime("%Y-%m-%dT%H:%M:%S")
        hour_key = str(parsed.hour)
        hour_breakdown[hour_key] = hour_breakdown.get(hour_key, 0) + 1
        words = preprocessor.filter_multiple(str(hit["_source"]["text"]),
                                             ats=True,
                                             hashtags=True,
                                             stopwords=True,
                                             stemming=False,
                                             urls=True,
                                             singles=True)
        word_counter.update(words)
    if created_at is not None:
        # Write the summary ONCE after processing all tweets. The original
        # wrote it (and deleted the "-latest" index) inside the loop,
        # clobbering the index it was still iterating over.
        freq_obj = {
            "hour_breakdown": hour_breakdown,
            "words": json.dumps(word_counter.most_common(400)),
            "total": total,
            "date": dateobj,
            "last_time": created_at
        }
        elastic_utils.add_entry(entry, entry_total + 1, freq_obj)
        elastic_utils.delete_index(entry + "-latest")
    try:
        elastic_utils.create_index(entry + "-latest")
    except Exception:
        # Best-effort recreate: the index may already exist (e.g. when no
        # tweets were collected and it was never deleted).
        print(
            "Todays index already exists! This is an exception, but it's probably ok"
        )
Example #2
0
 def test_add_entry(self):
     """Adding entries under fresh ids succeeds; reusing an id does not."""
     doc = {"name": "test"}
     first = es.add_entry(index_name="test", id=1, body=doc)
     self.assertEqual(True, first['created'])
     second = es.add_entry(index_name="test", id=2, body=doc)  #This should also work
     self.assertEqual(True, second['created'])
     duplicate = es.add_entry(index_name="test", id=1, body=doc)  #This should fail since its the same id
     self.assertEqual(False, duplicate['created'])
 def test_search_index(self):
     """A freshly indexed document is returned by search_index."""
     # Add entry first along with the index
     doc = {"name": "test"}
     es.create_index("searching")
     es.add_entry(index_name="searching", id=1, body=doc)
     time.sleep(1)  # give Elasticsearch a moment to make the doc searchable
     res = es.search_index(index_name="searching")
     first_hit = res['hits']['hits'][0]['_source']
     print(first_hit)
     self.assertIn('test', first_hit['name'])
     es.delete_index("searching")
def aggregate(tweet, topic, start_date):
    """Summarise a day's tweets and store the summary under *topic*.

    Walks the tweets (assumed ordered by descending hour — TODO confirm with
    callers), counts tweets per hour, extracts the 400 most common filtered
    words, and writes one summary document to Elasticsearch at the next
    unused id.

    Args:
        tweet: sequence of tweet objects exposing ``.date`` and ``.text``.
        topic: Elasticsearch index name to write the summary into.
        start_date: date the summary covers; stored as a string.

    Returns:
        The ``(word, count)`` pairs of the 400 most common words.
    """
    data = {}
    current_hour = 23
    current_tweet_count = 0
    common_words = []
    for entry in tweet:
        if entry.date.hour == current_hour:
            current_tweet_count += 1
        else:
            # Hour changed: close out the previous hour's bucket.
            data[current_hour] = current_tweet_count
            current_hour = current_hour - 1
            current_tweet_count = 1
        common_words.append(
            preprocessor.filter_multiple(str(entry.text),
                                         ats=True,
                                         stopwords=True,
                                         stemming=False,
                                         urls=True,
                                         singles=True))
    #TODO if no entries in tweet it won't work properly since it skips the for loop. Captured only for hour 23 in that case.
    count_word_frequency = Counter()
    for words in common_words:
        count_word_frequency.update(list(words))
    data[current_hour] = current_tweet_count
    top_words = count_word_frequency.most_common(400)
    # Explicit emptiness check instead of a bare except around indexing:
    # an empty input has no "last tweet" timestamp.
    if len(tweet) > 0:
        last_time = tweet[len(tweet) - 1].date
    else:
        last_time = "No Tweets"
    summary = {
        "date": str(start_date),
        "total": len(tweet),
        "last_time": last_time,
        "hour_breakdown": data,
        'words': json.dumps(top_words)
    }
    # Probe forward from the last known id until a free one is found.
    next_id = elastic_utils.last_id(topic)
    while elastic_utils.check_for_last_id(topic,
                                          query={
                                              "query": {
                                                  "match": {
                                                      "_id": next_id
                                                  }
                                              },
                                              "size": 1
                                          }) == True:
        next_id += 1
    elastic_utils.add_entry(topic, next_id, summary)
    return top_words
 def test_add_entry(self):
     """New ids report 'created'; a reused id reports 'updated'."""
     doc = {"name": "test"}
     outcome = es.add_entry(index_name="test", id=1, body=doc)
     time.sleep(1)
     self.assertEqual('created', outcome['result'])
     outcome = es.add_entry(index_name="test", id=2,
                            body=doc)  #This should also work
     time.sleep(1)
     self.assertEqual('created', outcome['result'])
     outcome = es.add_entry(index_name="test", id=1,
                            body=doc)  #This should fail since its the same id
     time.sleep(1)
     self.assertEqual('updated', outcome['result'])
def aggregate_words(user_id, status):
    """The aggregate_words task adds Tweets to Elasticsearch live from the Celery Queue.

    Matches the tweet against the user's categories: if a category name
    appears in the tweet text or the author's screen name, the tweet goes
    into "<category>-latest"; otherwise into "unknown-latest".

    Args:
        user_id: owner whose TwitterCat categories are consulted.
        status: dict-like tweet payload with 'text', 'name' and 'created'.
    """
    categories = TwitterCat.objects.filter(user_id=user_id)
    topic = None
    text_lower = status['text'].lower()
    name_lower = status['name'].lower()
    for entry in categories:
        # Check BOTH fields. The original used `a or b`, which evaluates to
        # just the first truthy string, so the author name was never checked
        # whenever the tweet text was non-empty.
        category = str(entry.category_name)
        if category in text_lower or category in name_lower:
            print(status['created'])
            topic = entry.category_name + "-latest"
            elastic_utils.create_index(topic)
            break
    if topic is None:
        topic = "unknown-latest"
        elastic_utils.create_index(topic)
    next_id = elastic_utils.last_id(topic) + 1
    elastic_utils.add_entry(topic, next_id, status)
 def test_delete_entry(self):
     """Deleting an existing entry succeeds; deleting it again raises."""
     #Add entry first
     doc = {"name": "test"}
     res = es.add_entry(index_name="test", id=1, body=doc)
     #Now Delete
     res = es.delete_entry(index_name="test", id=1)
     self.assertEqual('deleted', res['result'])
     # Now test when it doesn't exist. Pass the callable and its arguments
     # separately: the original invoked delete_entry eagerly, so any
     # exception escaped before assertRaises could catch it.
     self.assertRaises(Exception, es.delete_entry, index_name="test", id=1)
Example #8
0
    def on_status(self, status):
        """Persist each incoming original (non-retweet) status to Elasticsearch."""
        # Retweets carry a 'retweeted_status' attribute — filter them out.
        if hasattr(status, 'retweeted_status'):
            return
        # Module-level running id assigns each stored tweet the next slot.
        global id
        id += 1
        entry = {
            "description": str(status.user.description),
            "loc": str(status.user.location),
            "text": str(status.text),
            "coords": str(status.coordinates),
            "name": str(status.user.screen_name),
            "user_created": str(status.user.created_at),
            "followers": str(status.user.followers_count),
            "id_str": str(status.id_str),
            "created": str(status.created_at),
            "retweets": str(status.retweet_count)
        }
        print(id)
        es.add_entry(cfg.twitter_credentials['topic'], id, entry)
 def test_execute_all(self):
     """execute_all_term_functions reports the longest sentence length."""
     doc = {"text": "test more than one word. test"}
     es.add_entry(index_name="test", id=1, body=doc)
     time.sleep(2)  # wait for the entry to become searchable
     stats = termsfrequency.execute_all_term_functions(self, index="test")
     self.assertEqual(7, stats["max_sentence_size"])