def getCategories_2():
    """Debugging function: classify all news posts and return a text log.

    Fetches the posts with ``crawler.take_all_news_posts()``, passes them to
    ``classification.classify_posts()`` and renders the result as plain text:

    * first line: the number of categories,
    * one ``<category> <count>`` line per category,
    * then, for every category, its name followed by the tab-indented
      titles of its posts and an empty line.

    :return: a ``Response`` with mimetype ``text/plain`` holding the log
    """
    news_posts = crawler.take_all_news_posts()
    # NOTE(review): treated as a mapping of category -> sequence of posts,
    # because it is iterated by key and indexed with categories[cat].
    categories = classification.classify_posts(news_posts)

    # Collect the pieces and join once instead of growing a string with +=
    # (and avoid naming the buffer ``str``, which hides the builtin).
    lines = ['%d\n' % len(categories)]

    # Summary: how many posts landed in each category.
    for cat in categories:
        lines.append('%s %d\n' % (cat, len(categories[cat])))

    # Detail: the titles grouped under each category.
    for cat in categories:
        lines.append('%s\n' % cat)
        for post in categories[cat]:
            lines.append('\t%s\n' % post.title)
        lines.append('\n')

    return Response(''.join(lines), mimetype='text/plain')
def getCategoriesNB():
    """Debugging function: group all news posts by their naive-Bayes category.

    Loads three pickled dictionaries whose paths are given by
    ``naivebayes_classification.str_dict_word_in_cat``,
    ``str_dict_cat_count`` and ``str_dict_priors``, asks
    ``test_classifications.get_NB_category()`` for the category of every post
    and renders a plain-text report:

    * ``number of documents: <n>``,
    * one ``<category>\\t\\t<count>`` line per category,
    * then, for every category, its name followed by the tab-indented
      titles of its posts and an empty line.

    Any exception is caught and reported at the end of whatever part of the
    report was already produced, so the caller always gets a response.

    :return: a ``Response`` with mimetype ``text/plain`` holding the report
    """
    parts = []
    try:
        newsPosts = crawler.take_all_news_posts()

        # ``with`` guarantees each file is closed even when unpickling fails.
        # NOTE(review): the files are opened in the default mode exactly as
        # before; pickle streams are normally read in binary mode ('rb') --
        # confirm how these files are written before changing it.
        # NOTE(review): unpickling runs arbitrary code from the file; this is
        # only safe while these files are produced by the application itself.
        loaded = []
        for path in (naivebayes_classification.str_dict_word_in_cat,
                     naivebayes_classification.str_dict_cat_count,
                     naivebayes_classification.str_dict_priors):
            with open(path) as pickled:
                loaded.append(Unpickler(pickled).load())
        dict_words, dict_cats, dict_priors = loaded

        # category -> list of posts assigned to it
        dict_results = {}
        for post in newsPosts:
            category = test_classifications.get_NB_category(
                post.words, dict_words, dict_cats, dict_priors)
            dict_results.setdefault(category, []).append(post)

        parts.append('number of documents: %d\n' % len(newsPosts))
        for cat in dict_results:
            parts.append('%s\t\t%d\n' % (cat, len(dict_results[cat])))

        for cat in dict_results:
            parts.append('%s\n' % cat)
            for post in dict_results[cat]:
                parts.append('\t%s\n' % post.title)
            parts.append('\n')
    except Exception as inst:
        # Deliberately broad: this is a debugging view, so the error is shown
        # to the caller instead of propagating.
        # ``message`` is empty for multi-argument exceptions (e.g. IOError for
        # a missing pickle file) and does not exist on Python 3, so fall back
        # to the exception itself to keep the log line informative.
        detail = getattr(inst, 'message', '') or inst
        parts.append('Exception type: %s\n' % type(inst))
        parts.append('Exception: %s\n' % (detail,))

    return Response(''.join(parts), mimetype='text/plain')
def getNewsPosts():
    """Debugging function: list every news post as plain text.

    Each post returned by ``crawler.take_all_news_posts()`` produces one line
    of the form ``<index>: <title> <source_id> <source_url>``, where the index
    starts at 0 and follows the order in which the posts are returned.

    :return: a ``Response`` with mimetype ``text/plain`` holding the listing
    """
    newsPosts = crawler.take_all_news_posts()

    # enumerate() replaces the hand-maintained counter; the lines are joined
    # once instead of being appended to a string named after the builtin.
    lines = [
        '%d: %s %d %s\n' % (index, post.title, post.source_id, post.source_url)
        for index, post in enumerate(newsPosts)
    ]

    return Response(''.join(lines), mimetype='text/plain')
def getLinks():
    """Debugging function: list post titles with their top-weighted words.

    For every post returned by ``crawler.take_all_news_posts()`` the output
    contains the title, then up to ten ``<word> <weight>`` lines taken from
    the post's ``dict_tf_idf`` mapping in descending order of weight (tf-idf
    weights, going by the attribute name), then an empty line.

    :return: a ``Response`` with mimetype ``text/plain`` holding the log
    """
    newsPosts = crawler.take_all_news_posts()

    lines = []
    for post in newsPosts:
        lines.append(post.title + '\n')

        weights = post.dict_tf_idf
        # Highest weight first; negating the key keeps ties in the mapping's
        # iteration order.
        ranked = sorted(((word, weights[word]) for word in weights),
                        key=lambda pair: -pair[1])

        # Slicing already copes with fewer than ten entries.
        for word, weight in ranked[:10]:
            lines.append('%s %f\n' % (word, weight))
        lines.append('\n')

    return Response(''.join(lines), mimetype='text/plain')
def getClusters():
    """Debugging function: cluster all news posts, label and store the clusters.

    Steps, as performed by the code below:

    1. fetch the posts with ``crawler.take_all_news_posts()``;
    2. load the three pickled naive-Bayes dictionaries whose paths are given
       by ``naivebayes_classification``;
    3. cluster the posts with ``clustering.cluster_news()`` and order the
       clusters by descending number of posts;
    4. for each cluster, let every post vote with the category returned by
       ``test_classifications.get_NB_category()`` and keep the category with
       the most votes (empty string if the cluster has no posts);
    5. build a ``NewsPostClient`` per post, wrap them in a ``Cluster`` with
       the winning category and call ``put()`` on it (presumably persisting
       it -- confirm against the ``Cluster`` model).

    The response text is the per-cluster title listing followed by the
    progress/diagnostic log. Any exception is caught and appended to the log,
    so a partial listing plus the error is returned instead of a failure.

    :return: a ``Response`` with mimetype ``text/plain``
    """
    listing = []  # "cluster N" headers and post titles, shown first
    log = []      # progress and diagnostic lines, shown after the listing
    try:
        newsPosts = crawler.take_all_news_posts()

        # Utility dicts for majority voting with naive Bayes.
        # ``with`` guarantees each file is closed even when unpickling fails.
        # NOTE(review): the files are opened in the default mode exactly as
        # before; pickle streams are normally read in binary mode ('rb') --
        # confirm how these files are written before changing it.
        # NOTE(review): unpickling runs arbitrary code from the file; this is
        # only safe while these files are produced by the application itself.
        loaded = []
        for path in (naivebayes_classification.str_dict_word_in_cat,
                     naivebayes_classification.str_dict_cat_count,
                     naivebayes_classification.str_dict_priors):
            with open(path) as pickled:
                loaded.append(Unpickler(pickled).load())
        dict_words, dict_cats, dict_priors = loaded
        log.append('took the newsposts \n')

        clusters, innerfeedback = clustering.cluster_news(newsPosts)
        log.append('%s\n' % innerfeedback)
        log.append('done the clustering\n')
        log.append('num of clusters: %d\n' % len(clusters))

        # Biggest clusters first.
        clusters = sorted(clusters, key=lambda cluster: -len(cluster.posts))

        for index, cluster in enumerate(clusters):
            log.append('getting posts from cluster\n')
            posts = cluster.posts
            log.append('got the posts from cluster\n')
            listing.append('cluster %d\n' % index)

            # Majority voting: every post votes for its naive-Bayes category.
            votes_cat = {}
            for post in posts:
                listing.append(' \t %s\n' % post.title)
                category = test_classifications.get_NB_category(
                    post.words, dict_words, dict_cats, dict_priors)
                votes_cat[category] = 1 + votes_cat.get(category, 0)

            # max() returns the first category reaching the highest count,
            # matching a strict ">" scan; an empty cluster keeps ('', 0).
            if votes_cat:
                maxCat = max(votes_cat, key=votes_cat.get)
                maxVotes = votes_cat[maxCat]
            else:
                maxCat, maxVotes = '', 0
            log.append('^^^ CLUSTER CATEGORY: %s with maxVotes: %d\n'
                       % (maxCat, maxVotes))

            listNews = []
            log.append(' number of posts in cluster %d\n' % len(posts))
            for post in posts:
                log.append('trying to create NewsPostClient\n')
                log.append('title: %s \n' % post.title)
                log.append('numWords: %d\n' % post.numWords)
                log.append('url: %s\n' % post.url)
                newNews = NewsPostClient(
                    url=post.url,
                    host_page=post.host_page,
                    title=post.title,
                    numWords=post.numWords,
                    source_id=post.source_id,
                    source_url=post.source_url,
                    img_url=post.img_url,
                    description=post.description)
                log.append('created NewsPostClient')
                listNews.append(newNews)
                log.append('appended newNews\n')

            newCluster = Cluster(category=maxCat, listNews=listNews)
            newCluster.put()

            listing.append('\n')
    except Exception as inst:
        # Deliberately broad: this is a debugging view, so the error is shown
        # to the caller together with the progress log gathered so far.
        # ``message`` is empty for multi-argument exceptions (e.g. IOError for
        # a missing pickle file) and does not exist on Python 3, so fall back
        # to the exception itself to keep the log line informative.
        detail = getattr(inst, 'message', '') or inst
        log.append('Exception type: %s\n' % (type(inst)))
        log.append('Exception: %s\n' % (detail,))

    # Listing first, log second -- on success and on failure alike.
    return Response(''.join(listing) + ''.join(log), mimetype='text/plain')