def main():
    """Seed the database from scratch.

    Builds the per-city corpora, seeds the words table (city, word,
    prob(word|city)), loads the per-city tweet corpora, then records
    per-region and total tweet counts plus selected features.
    """
    data.city_corpus_dict()
    # seeds words table with city, word, prob(w/city)
    data.seed_words_table()
    data.city_tweet_corpus_dict()
    for city in cities:
        data.create_region_tweet_count(city)
        feature_selection.populate_db_with_features(city)
    data.create_tweet_total_count()
def classify_text():
    """Classify the POSTed tweet and render the results map.

    Reads the tweet text from the form field ``tweet``, ranks candidate
    cities, pulls the top words for the best-ranked city, and renders
    ``map.html`` with the ranking, city names and those words.
    Timing for each expensive stage is printed for debugging.
    """
    tweet = request.form['tweet']

    start = datetime.datetime.now()
    rankings = data.create_ranking(tweet)
    end = datetime.datetime.now()
    print('getting city rankings takes: %s' % (end - start))

    start = datetime.datetime.now()
    # rankings[0][0] is the top-ranked city object
    top_5_words = feature_selection.top_words_in_tweet(rankings[0][0], tweet)
    end = datetime.datetime.now()
    print('getting top 5 words takes: %s' % (end - start))

    # NOTE(review): removed the original's fetch of data.city_corpus_dict()
    # and the word-count dict built from it -- the author's own timing label
    # called it "bogus" and its result was never used.
    final_result = list(top_5_words)
    names = [ranking[0].name for ranking in rankings]

    return render_template("map.html", tweet=tweet, names=names,
                           rankings=rankings, final_result=final_result)
def get_corpus():
    """Return the combined vocabulary across all cities.

    Unions the word keys of every city's word->count dict into one
    de-duplicated word list. Order is unspecified (as in the original,
    which keyed a dict with ``True`` values to emulate a set).
    """
    city_to_dict_of_words_to_counts = data.city_corpus_dict()
    vocab = set()
    for city in cities:
        # a dict iterates its keys, so this adds each city's words
        vocab.update(city_to_dict_of_words_to_counts[city.name])
    return list(vocab)
def classify_text():
    """Classify the POSTed tweet and render the results map with per-city stats.

    Reads the tweet text from the form field ``tweet``, ranks candidate
    cities, and for every city gathers its corpus length, tweet count and
    the feature strings present in the tweet. Renders ``map.html`` with
    all of it. Timing for each expensive stage is printed for debugging.
    """
    tweet = request.form['tweet']

    start = datetime.datetime.now()
    rankings = data.create_ranking(tweet)
    end = datetime.datetime.now()
    print('getting city rankings takes: %s' % (end - start))

    start = datetime.datetime.now()
    feature_strings_dict = {}
    city_corpus_leng_dict = {}
    city_tweet_count_dict = {}
    for city in cities:
        city_corpus_leng_dict[city.name] = data.find_leng_city_corpus(city)
        city_tweet_count_dict[city.name] = data.create_region_tweet_count(city)
        feature_strings_dict[city.name] = \
            feature_selection.included_feature_strings(city, tweet)
    end = datetime.datetime.now()
    # original message said "getting top 5 words" but this stage gathers
    # per-city stats; fixed the label
    print('gathering per-city stats takes: %s' % (end - start))

    # NOTE(review): removed the original's fetch of data.city_corpus_dict()
    # and the word-count dict built from it -- the author's own timing label
    # called it "bogus" and its result was never used.
    names = [ranking[0].name for ranking in rankings]

    return render_template("map.html", tweet=tweet,
                           city_tweet_count_dict=city_tweet_count_dict,
                           names=names,
                           city_corpus_leng_dict=city_corpus_leng_dict,
                           feature_strings_dict=feature_strings_dict,
                           rankings=rankings)
def rank(city):
    """Return up to 30 non-stop-word feature words for *city*.

    Scores every word in the city's corpus by mutual information against
    the contingency counts from ``get_tweet_word_counts`` (N11/N10/N01/N00),
    sorts descending by score, and returns the 30 best words that are not
    stop words.
    """
    word_dict = data.city_corpus_dict()[city.name]
    feature_weight_list = []
    for word in word_dict:  # dict iterates its keys
        wf = get_tweet_word_counts(word, city)
        score = mutual_info_score(wf['N11'], wf['N10'],
                                  wf['N01'], wf['N00'])
        feature_weight_list.append((word, score))
    feature_weight_list.sort(key=operator.itemgetter(1), reverse=True)

    # Stop words to exclude. The original used a dict mapping each word to 1
    # (with duplicate keys 'at' and 'for'); a set is the idiomatic container.
    stop_words = set([
        'through', 'our', 'about', 'before', 'between', 'by', 'during',
        'except', 'for', 'with', 'without', 'in', 'how', 'his', 'took',
        'could', 'would', 'will', 'at', 'should', 'can', 'we', 'us', 'as',
        'him', 'to', 'sometimes', 'you', 'were', 'i', 'my', 'her', 'he',
        'me', 'this', 'was', 'had', 'all', 'the', 'but', 'or', 'and',
        'there', 'it', 'is', 'then', 'a', 'an', 'be', 'of', 'what', 'when',
        'why', 'where', 'are', 'am', 'because', 'they',
    ])

    winner_list = []
    for word, _score in feature_weight_list:
        if word not in stop_words:
            winner_list.append(word)
            if len(winner_list) == 30:
                # original kept scanning the whole list after reaching 30;
                # stopping early yields the same result
                break
    return winner_list
def classify_text():
    """Classify the POSTed tweet and render the results map with per-city stats.

    Reads the tweet text from the form field ``tweet``, ranks candidate
    cities, and for every city gathers its corpus length, tweet count and
    the feature strings present in the tweet. Renders ``map.html`` with
    all of it. Timing for each expensive stage is printed for debugging.
    """
    tweet = request.form['tweet']

    start = datetime.datetime.now()
    rankings = data.create_ranking(tweet)
    end = datetime.datetime.now()
    print('getting city rankings takes: %s' % (end - start))

    start = datetime.datetime.now()
    feature_strings_dict = {}
    city_corpus_leng_dict = {}
    city_tweet_count_dict = {}
    for city in cities:
        city_corpus_leng_dict[city.name] = data.find_leng_city_corpus(city)
        city_tweet_count_dict[city.name] = data.create_region_tweet_count(city)
        feature_strings_dict[city.name] = \
            feature_selection.included_feature_strings(city, tweet)
    end = datetime.datetime.now()
    # original message said "getting top 5 words" but this stage gathers
    # per-city stats; fixed the label
    print('gathering per-city stats takes: %s' % (end - start))

    # NOTE(review): removed the original's fetch of data.city_corpus_dict()
    # and the word-count dict built from it -- the author's own timing label
    # called it "bogus" and its result was never used.
    names = [ranking[0].name for ranking in rankings]

    return render_template("map.html", tweet=tweet,
                           city_tweet_count_dict=city_tweet_count_dict,
                           names=names,
                           city_corpus_leng_dict=city_corpus_leng_dict,
                           feature_strings_dict=feature_strings_dict,
                           rankings=rankings)
def rank(city):
    """Return every non-stop-word feature word for *city*, best first.

    Scores every word in the city's corpus by mutual information against
    the contingency counts from ``get_tweet_word_counts`` (N11/N10/N01/N00),
    sorts descending by score, and returns all words that are not stop
    words (unlike the 30-word variant of this function, no cap is applied).
    """
    word_dict = data.city_corpus_dict()[city.name]
    feature_weight_list = []
    for word in word_dict:  # dict iterates its keys
        wf = get_tweet_word_counts(word, city)
        score = mutual_info_score(wf['N11'], wf['N10'],
                                  wf['N01'], wf['N00'])
        feature_weight_list.append((word, score))
    feature_weight_list.sort(key=operator.itemgetter(1), reverse=True)

    # Stop words to exclude. The original used a dict mapping each word to 1
    # (with duplicate keys 'at' and 'for'); a set is the idiomatic container.
    stop_words = set([
        'every', 'got', 'through', 'our', 'especially', 'about', 'before',
        'between', 'by', 'during', 'except', 'for', 'with', 'without', 'in',
        'how', 'his', 'took', 'could', 'would', 'will', 'at', 'should',
        'can', 'we', 'us', 'as', 'him', 'to', 'sometimes', 'you', 'were',
        'i', 'my', 'her', 'he', 'me', 'this', 'was', 'had', 'all', 'the',
        'but', 'or', 'and', 'there', 'it', 'is', 'then', 'a', 'an', 'be',
        'of', 'what', 'when', 'why', 'where', 'are', 'am', 'because',
        'they',
    ])

    return [word for word, _score in feature_weight_list
            if word not in stop_words]