def collect_todays_tweets(entry):
    """Collects today's tweets for every topic."""
    word_counter = Counter()
    hour_break_dict = {}
    if "-latest" not in entry and "median" not in entry:
        # We first need to collect all of today's tweets.
        entry_total = elastic_utils.last_id(entry)
        if elastic_utils.check_index_exists(entry + "-latest"):
            total = elastic_utils.last_id(entry + "-latest")
            day_res = elastic_utils.iterate_search(
                entry + "-latest",
                query={
                    "query": {"match_all": {}},
                    "sort": [{"last_time": {"order": "desc"}}]
                })
            for tweet in day_res:
                time_of_tweet = tweet["_source"]["created"]
                datetime_object = datetime.strptime(time_of_tweet,
                                                    '%Y-%m-%d %H:%M:%S')
                dateobj = datetime_object.strftime("%Y-%m-%d")
                created_at = datetime_object.strftime("%Y-%m-%dT%H:%M:%S")
                # Count tweets per hour of the day.
                hour = str(datetime_object.hour)
                hour_break_dict[hour] = hour_break_dict.get(hour, 0) + 1
                words = preprocessor.filter_multiple(
                    str(tweet["_source"]["text"]),
                    ats=True, hashtags=True, stopwords=True,
                    stemming=False, urls=True, singles=True)
                word_counter.update(words)
            # dateobj/created_at hold the values from the last tweet iterated.
            freq_obj = {
                "hour_breakdown": hour_break_dict,
                "words": json.dumps(word_counter.most_common(400)),
                "total": total,
                "date": dateobj,
                "last_time": created_at
            }
            elastic_utils.add_entry(entry, entry_total + 1, freq_obj)
            elastic_utils.delete_index(entry + "-latest")
            try:
                elastic_utils.create_index(entry + "-latest")
            except Exception:
                print("Today's index already exists! "
                      "This is an exception, but it's probably ok.")

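
# A minimal usage sketch (an assumption, not part of the original module):
# collect_todays_tweets appears to run once per day for every tracked topic,
# so a daily driver might loop over the base indexes. It reuses
# elastic_utils.list_all_indexes(), which this codebase uses elsewhere.
def rollover_all_topics():
    """Hypothetical daily driver that rolls every topic's -latest index."""
    for topic in elastic_utils.list_all_indexes():
        if "-latest" not in topic and "-median" not in topic:
            collect_todays_tweets(topic)
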
def execute_all_term_functions(self, index, number_word_frequency_results=10):
    current_max_sentence_size = 0
    count_word_frequency = Counter()
    res = es.iterate_search(index_name=index)
    for entry in res:
        # Step 1. Track the maximum sentence size as we go.
        current_tweet = preprocessor.preprocess(entry['_source']['text'])
        if len(current_tweet) > current_max_sentence_size:
            current_max_sentence_size = len(current_tweet)
        # Step 2. Update the word-frequency counter, skipping stop words.
        terms_all = [term for term in current_tweet if term not in stop]
        count_word_frequency.update(terms_all)
    return {
        "word_frequency":
        count_word_frequency.most_common(number_word_frequency_results),
        "max_sentence_size": current_max_sentence_size
    }

def max_tweet_sentence_size(self, filename):
    # TODO: needs a new function to support Elasticsearch first.
    return -1

def count_word_frequency(self, filename):
    # TODO: needs a new function to support Elasticsearch first.
    return -1

def most_common_words(self, num_results, filename):
    # TODO: needs a new function to support Elasticsearch first.
    return -1

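
# Hypothetical usage, assuming these methods live on a term-statistics helper
# class (the class definition is not shown in this excerpt):
#
#   stats = analyzer.execute_all_term_functions("mytopic",
#                                               number_word_frequency_results=20)
#   print(stats["max_sentence_size"])
#   print(stats["word_frequency"])
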
def test():
    texts = []
    res = elastic_utils.iterate_search(
        index_name=cfg.twitter_credentials['topic'])
    for i in res:
        processed_text = preprocessor.preprocess(i['_source']['text'])
        processed_text = preprocessor.remove_stop_words(processed_text)  # remove stop words
        processed_text = preprocessor.remove_urls(processed_text)  # remove URLs
        processed_text = preprocessor.remove_ats(processed_text)  # remove @username mentions
        processed_text = preprocessor.remove_hashtags(processed_text)  # remove hashtags? TODO: these might be useful
        texts.append(processed_text)
    doc_2_vec = testlda.run(texts)
    return doc_2_vec

def setup_charts(cat):
    """Sets up the data for the charts on the front end."""
    tot = len(cat)
    entries_arrays = []
    i = 0
    for mod in cat:
        current_entry = [mod]
        res = elastic_utils.iterate_search(
            index_name=mod,
            query={
                "query": {"match_all": {}},
                "sort": [{"last_time": {"order": "desc"}}],
                "size": 20,
            })
        i += 1
        for entry in res:
            current_entry.append(entry["_source"]["total"])
        # i has already been incremented, so the final topic has i == tot.
        if i != tot:
            current_task.update_state(state='PROGRESS',
                                      meta={
                                          'current_percentage': (i / tot) * 100,
                                          'current_entry': mod,
                                          "chart_data": entries_arrays
                                      })
        else:
            current_task.update_state(state='PROGRESS',
                                      meta={
                                          'current_percentage': (i / tot) * 100,
                                          'current_entry': mod,
                                          "chart_data": entries_arrays,
                                          "latest_chart_data": current_entry,
                                          "test": 'Finished'
                                      })
        entries_arrays.append(current_entry)
    print("task finished.")
    return entries_arrays

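
# setup_charts reports progress through current_task, so it is presumably
# registered as a Celery task. A hypothetical invocation from a view (the
# .delay() wiring is an assumption):
#
#   result = setup_charts.delay(["topic_a", "topic_b"])
#   result.state  # 'PROGRESS' metadata carries chart_data for the front end
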
def run_tf_idf(n_clusters, n_init, verbose):
    max_features = 10000
    # Create the TF-IDF representation of the texts.
    tfidf = TfidfVectorizer(max_df=0.95,
                            min_df=2,
                            stop_words='english',
                            max_features=max_features)
    texts = []
    res = elastic_utils.iterate_search(
        index_name=cfg.twitter_credentials['topic'])
    for i in res:
        texts.append(i['_source']['text'])
    tfidf_vector = tfidf.fit_transform(texts)
    # Use the function's parameters (the original hard-coded these values,
    # which silently ignored the caller's arguments).
    km = KMeans(n_clusters=n_clusters,
                init='k-means++',
                n_init=n_init,
                verbose=verbose)
    km.fit(tfidf_vector)
    result = {"model": km, "texts": texts}
    with open("save.p", "wb") as f:
        pickle.dump(result, f)
    return result

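
# A minimal sketch of consuming run_tf_idf's output. Note that the pickle
# stores only the fitted KMeans model and the raw texts, not the fitted
# TfidfVectorizer, so clustering unseen tweets would require persisting the
# vectorizer as well. The file name "save.p" comes from the function above.
def inspect_clusters(n_examples=10):
    """Print the cluster label assigned to the first few training tweets."""
    with open("save.p", "rb") as f:
        result = pickle.load(f)
    km = result["model"]
    for text, label in zip(result["texts"][:n_examples], km.labels_):
        print(label, text[:80])
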
def check_index():
    """Check index is the main algorithm. It detects trends in real time.
    This task runs every 5 minutes."""
    index = elastic_utils.list_all_indexes()
    ts = datetime.now() - timedelta(minutes=5)
    total_count = 0
    for entry in index:
        word_counter = Counter()
        if "-latest" not in entry and "median" not in entry:
            if elastic_utils.check_index_exists(entry + "-latest"):
                day_res = elastic_utils.iterate_search(
                    entry + "-latest",
                    query={
                        "query": {"match_all": {}},
                        "sort": [{"created.keyword": {"order": "desc"}}]
                    })
                total_in_five = 0
                tweet_list = []
                name = []
                for item in day_res:
                    time_of_tweet = item["_source"]["created"]
                    datetime_object = datetime.strptime(
                        time_of_tweet, '%Y-%m-%d %H:%M:%S')
                    if datetime_object > ts:
                        # Cap any single account at 3 tweets per window so one
                        # user cannot trigger a trend on their own.
                        if name.count(item["_source"]["name"]) < 3:
                            name.append(item["_source"]["name"])
                            tweet_list.append(str(item["_source"]["text"]))
                            total_in_five += 1
                            words = preprocessor.filter_multiple(
                                str(item["_source"]["text"]),
                                ats=True, hashtags=True, stopwords=True,
                                stemming=False, urls=True, singles=True)
                            # Count each word at most once per tweet.
                            word_counter.update(set(words))
                    else:
                        # Results are sorted newest-first, so stop at the first
                        # tweet older than the window instead of iterating
                        # through every entry. This saves a lot of time.
                        break
                res = elastic_utils.iterate_search(entry + "-median")
                potential_keywords = []
                for median in res:
                    breakdown = median["_source"]["five_minute_median"]
                    if total_in_five == 0 or breakdown == 0:
                        total_five_ratio = 0
                    elif breakdown < 1:
                        total_five_ratio = 1
                    else:
                        total_five_ratio = total_in_five / breakdown
                    if total_five_ratio > 2.0:
                        potential_keywords.append(
                            (entry, total_five_ratio, entry, "Monthly"))
                    yesterdays_res = median["_source"]["yesterday_res"]
                    for key, current_word in word_counter.items():
                        if current_word > 5 and key in yesterdays_res:
                            # Scale yesterday's daily count to a 5-minute window.
                            test_var = ((yesterdays_res[key][0] / 24) / 60) * 5
                            current_word_ratio = current_word / test_var
                            if key == entry:
                                if current_word_ratio > 2.5:
                                    potential_keywords.append(
                                        (entry, current_word_ratio, key,
                                         "Yesterday"))
                                continue
                            elif current_word_ratio > 2.0:
                                potential_keywords.append(
                                    (entry, current_word_ratio, key,
                                     "Yesterday"))
                        existing_words = median["_source"]["day_words_median"]
                        existing_dev = median["_source"]["standard_dev"]
                        if current_word > 5 and key in existing_words:
                            existing_val = ((existing_words[key] / 24) / 60) * 5
                            standard_dev_5_mins = (
                                (existing_dev[key] / 24) / 60) * 5
                            compared_to_monthly_ratio = current_word / existing_val
                            if current_word > (existing_val +
                                               2 * standard_dev_5_mins):
                                potential_keywords.append(
                                    (entry,
                                     current_word - (existing_val +
                                                     2 * standard_dev_5_mins),
                                     key, "Deviation"))
                            if compared_to_monthly_ratio > 1.9:
                                potential_keywords.append(
                                    (entry, compared_to_monthly_ratio, key,
                                     "Monthly"))
                        if (current_word > 6 and key not in existing_words
                                and key not in yesterdays_res):
                            potential_keywords.append(
                                (entry, current_word, key, "No Entries"))
                notification = check_percentage(entry, tweet_list,
                                                potential_keywords)
                if "total" in notification:
                    total_count += notification["total"]
    data = json.dumps({'job': total_count})
    Group('notifications').send({'text': data})

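
# The trend tests above repeatedly scale a per-day figure down to a
# five-minute window. A hedged restatement of that arithmetic (the helper
# name is illustrative, not from the original code):
def expected_in_five_minutes(daily_count):
    """Expected five-minute volume given a daily total: ((daily/24)/60) * 5."""
    return ((daily_count / 24) / 60) * 5

# For example, a word seen 288 times yesterday implies roughly 1 occurrence
# per five minutes, so six or more occurrences in the current window clear
# the "current_word > 5" gate and beat the 2.0 ratio threshold comfortably.
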
def elastic_info(index_list):
    """Displays statistics from the topics."""
    final_res = []
    current_entry = 0
    all_entries = []
    for entry in index_list:
        index_dict = {}
        all_entries.append(entry)
        index_dict["name"] = {}
        index_dict["current_entry"] = entry
        if current_entry == 0:
            current_task.update_state(state='PROGRESS',
                                      meta={
                                          'current_percentage': 0,
                                          "current_entry": entry
                                      })
        res = elastic_utils.search_index(
            entry,
            query={
                "query": {"match_all": {}},
                "sort": [{"date": {"order": "desc"}}],
                "size": 10
            })
        current_array = []
        for current in res["hits"]["hits"]:
            current_array.append({
                "date": current["_source"]["date"],
                "total": current["_source"]["total"],
                "last_collected": current["_source"]["last_time"]
            })
        index_dict["name"]["current"] = current_array
        median_array = []
        res_median = elastic_utils.iterate_search(entry + "-median")
        for median in res_median:
            median_array.append({
                "day_median": median["_source"]["day_median"],
                "hour_median": median["_source"]["hour_median"],
                "minute_median": median["_source"]["minute_median"]
            })
        index_dict["name"]["median"] = median_array
        res_latest = elastic_utils.search_index(
            entry + "-latest",
            query={
                "query": {"match_all": {}},
                "sort": [{"created.keyword": {"order": "desc"}}],
                "size": 5
            })
        latest_array = []
        for item in res_latest["hits"]["hits"]:
            latest_array.append({
                "created": item["_source"]["created"],
                "text": item["_source"]["text"],
                "image": item["_source"]["profile_picture"],
                "name": item["_source"]["name"]
            })
        index_dict["name"]["latest"] = latest_array
        all_entries.append(latest_array)
        if current_entry != 0:
            current_task.update_state(
                state='PROGRESS',
                meta={
                    'current_percentage':
                    (current_entry / len(index_list)) * 100,
                    'current_entry': entry,
                    'final_res': final_res
                })
        current_entry += 1
        final_res.append(index_dict)
    print(len(final_res))
    return final_res

def get_median(entry):
    """Calculates the median for every topic."""
    # Track per-word totals across the days, plus day/hour/minute breakdowns
    # for each entry.
    day_breakdown = []
    hour_breakdown = []
    minute_breakdown = []
    latest_words = {}
    day_res = elastic_utils.iterate_search(
        entry,
        query={
            "query": {"match_all": {}},
            "sort": [{"date": {"order": "desc"}}]
        })
    # Iterate through the entries by date, newest first.
    day = 0
    yesterday_res = {}
    for latest in day_res:
        try:
            hours = latest["_source"]["hour_breakdown"]
        except KeyError:
            # No tweets were collected for this day; skip it.
            continue
        # Per-word totals. "words" was stored as a JSON string of
        # (word, count) pairs, so decode it rather than string-mangling.
        for word, count in json.loads(latest["_source"]["words"]):
            total = int(count)
            if len(hours) < 24:
                # Extrapolate to a full day when not every hour was collected.
                total = int((total / len(hours)) * 24)
            # Elasticsearch field names cannot contain dots, so normalise the
            # keys before using them.
            key = word.replace(".", "dot").replace(",", "comma")
            latest_words.setdefault(key, []).append(total)
            if day == 0:
                yesterday_res.setdefault(key, []).append(total)
        # Now the breakdown over time.
        if len(hours) == 24:
            day_breakdown.append(latest["_source"]["total"])
        else:
            # Extrapolate when not all entries were collected.
            day_b = (latest["_source"]["total"] / len(hours)) * 24
            day_breakdown.append(day_b)
        # The hourly totals captured for this day.
        todays_hours = sorted(hours[hour] for hour in hours)
        hour_med = statistics.median(todays_hours)  # median hourly volume for the day
        minute_estimate = hour_med / 60  # divide by 60 for a per-minute median
        hour_breakdown.append(hour_med)
        minute_breakdown.append(minute_estimate)
        day += 1
    # Now calculate the medians.
    day_breakdown.sort()
    minute_breakdown.sort()
    hour_breakdown.sort()
    five_min_median = 0
    count = elastic_utils.count_entries(entry)
    totals_array, standard_dev = add_zeros(latest_words, count)
    if len(day_breakdown) != 0:
        day_median = statistics.median(day_breakdown)
    else:
        day_median = 0
    if len(minute_breakdown) != 0:
        minute_median = statistics.median(minute_breakdown)
        five_min_median = minute_median * 5
    else:
        minute_median = 0
    if len(hour_breakdown) != 0:
        hour_median = statistics.median(hour_breakdown)
    else:
        hour_median = 0
    es_obj = {
        "index": entry,
        "day_median": day_median,
        "minute_median": minute_median,
        "hour_median": hour_median,
        "five_minute_median": five_min_median,
        "day_words_median": totals_array,
        "yesterday_res": yesterday_res,
        "standard_dev": standard_dev
    }
    if "-median" not in entry:
        if not elastic_utils.check_index_exists(entry + "-median"):
            elastic_utils.create_index(entry + "-median")
        elastic_utils.add_entry_median(entry + "-median", es_obj)

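
# Hypothetical read-back of the document written above; the field names match
# es_obj, but the call site is an assumption:
#
#   for doc in elastic_utils.iterate_search("mytopic-median"):
#       print(doc["_source"]["day_median"],
#             doc["_source"]["five_minute_median"])
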
def run_nmf(n_samples, n_features, n_components, n_top_words):
    texts = []
    res = elastic_utils.iterate_search(
        index_name=cfg.twitter_credentials['topic'])
    for i in res:
        texts.append(i['_source']['text'])

    # Use tf-idf features for NMF.
    print("Extracting tf-idf features for NMF...")
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95,
                                       min_df=2,
                                       max_features=n_features,
                                       stop_words='english')
    t0 = time()
    tfidf = tfidf_vectorizer.fit_transform(texts)
    print("done in %0.3fs." % (time() - t0))

    # Use tf (raw term count) features for LDA.
    print("Extracting tf features for LDA...")
    tf_vectorizer = CountVectorizer(max_df=0.95,
                                    min_df=2,
                                    max_features=n_features,
                                    stop_words='english')
    t0 = time()
    tf = tf_vectorizer.fit_transform(texts)
    print("done in %0.3fs." % (time() - t0))
    print()

    # Fit the NMF model.
    print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
          "n_samples=%d and n_features=%d..." % (n_samples, n_features))
    t0 = time()
    nmf = NMF(n_components=n_components,
              random_state=1,
              alpha=.1,
              l1_ratio=.5).fit(tfidf)
    print("done in %0.3fs." % (time() - t0))
    print("\nTopics in NMF model (Frobenius norm):")
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
    print_top_words(nmf, tfidf_feature_names, n_top_words)

    # Fit the NMF model with the generalized Kullback-Leibler divergence.
    print("Fitting the NMF model (generalized Kullback-Leibler divergence) "
          "with tf-idf features, n_samples=%d and n_features=%d..." %
          (n_samples, n_features))
    t0 = time()
    nmf = NMF(n_components=n_components,
              random_state=1,
              beta_loss='kullback-leibler',
              solver='mu',
              max_iter=1000,
              alpha=.1,
              l1_ratio=.5).fit(tfidf)
    print("done in %0.3fs." % (time() - t0))
    print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
    categories = print_top_words(nmf, tfidf_feature_names, n_top_words)

    # The model was fitted on tf-idf features, so transform the tf-idf matrix.
    # (The original passed the tf matrix and tf feature names here, which do
    # not match the fitted model.)
    predict = nmf.transform(tfidf)

    result = {"predictions": predict, "text": texts, "categories": categories}
    return result

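
# A minimal sketch of consuming run_nmf's output: "predictions" is an
# (n_documents x n_components) matrix, so each tweet's dominant topic is the
# argmax of its row. The numpy import is an addition for this example only.
import numpy as np

def dominant_topics(result, n_examples=5):
    """Print the dominant NMF topic for the first few tweets."""
    labels = np.argmax(result["predictions"], axis=1)
    for text, label in zip(result["text"][:n_examples], labels):
        print(label, text[:80])
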
def timeline(request):
    # Both branches run the same lookup on the first matching category; only
    # the queryset differs.
    if request.POST:
        answer = request.POST['dropdown']
        cat = TwitterCat.objects.filter(category_name=answer)
    else:
        cat = TwitterCat.objects.filter(user=request.user)
    name = ""
    for mod in cat:
        res = elastic_utils.iterate_search(
            index_name=mod.category_name,
            query={
                "size": 20,
                "query": {"match_all": {}},
                "sort": [{"date": {"order": "desc"}}],
            })
        med = elastic_utils.search_index(index_name=mod.category_name +
                                         "-median")
        name = mod.category_name
        break
    cat = TwitterCat.objects.filter(user=request.user)
    data = {}
    i = 0
    for entry in res:
        temp_data = {}
        for hour in entry["_source"]["hour_breakdown"]:
            temp_data[int(hour)] = entry["_source"]["hour_breakdown"][hour]
        data[entry["_source"]["date"]] = temp_data
        i += 1
        if i == 20:
            break
    day_median = med["hits"]["hits"][0]["_source"]["day_median"]
    hour_median = med["hits"]["hits"][0]["_source"]["hour_median"]
    minute_median = med["hits"]["hits"][0]["_source"]["minute_median"]
    # Thresholds the template uses to flag unusual volume.
    hour_med_tresh = round(hour_median * 2, 2)
    minute_med_tresh = round(minute_median * 2, 2)
    day_med_tresh = round(day_median * 1.5, 2)
    return render(
        request, "fyp/timeline/index.html", {
            "data": data,
            "name": name,
            "cats": cat,
            "hour_med_tresh": hour_med_tresh,
            "minute_med_tresh": minute_med_tresh,
            "day_med_tresh": day_med_tresh
        })
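
# Hypothetical URL wiring for the view above (the project's urls.py is not
# shown, so the route and name are assumptions):
#
#   from django.urls import path
#   urlpatterns = [path("timeline/", timeline, name="timeline")]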