def parse_data(tablename, site, url_list):
    """Fetch each feed URL and store every entry via CollectionMapping.

    tablename -- collection/table name the entries are written to
    site      -- site identifier stored alongside each entry
    url_list  -- iterable of RSS/Atom feed URLs to parse
    """
    # Compile once, outside the loops. '<[^>]*>' matches one HTML tag at a
    # time; the original greedy '<.*>' stripped everything between the
    # first '<' and the last '>' on a line, eating the text between tags.
    tag_re = re.compile(r'<[^>]*>')
    for url in url_list:
        data = feedparser.parse(url)
        for news in data.entries:
            title = news['title']
            link = news['link']
            try:
                # Prefer the full content body when the feed provides one.
                description = news['content'][0]['value']
            except (KeyError, IndexError):
                # Narrowed from a bare 'except:' — only the lookup above
                # is expected to fail; fall back to the short summary.
                description = news['summary']
            # Strip markup once for whichever branch produced the text
            # (the original duplicated this re.sub in both branches).
            description = tag_re.sub('', description)
            slug = slugify(title)
            collection_obj = CollectionMapping(tablename)
            collection_obj.load_json({
                'site': site,
                'slug': slug,
                'name': title,
                'description': description,
                'link': link,
            })
if __name__ == "__main__":
    # Drop stale data before re-fetching everything.
    CollectionMapping('news_news').delete_all()
    CollectionMapping('news_category').delete_all()

    # Fetch every configured feed.
    # Assumes news_dict maps tablename -> {site: url} — TODO confirm shape.
    for tablename, url_dict in news_dict.items():
        for site, url in url_dict.items():
            parse_data(tablename, site, url)

    # Train the classifier on the labelled sample data.
    category_set = TrainClassifier.train_classifier(news_training_dict)
    # Counts start at 1, matching the original code (presumably so no
    # category ever reports a zero count — TODO confirm intent).
    category_dict = {slugify(category): 1 for category in news_training_dict}

    # Classify each stored news document and tally per-category hits.
    for news in CollectionMapping('news_news').objects.all():
        bayes_obj = BayesClassifier(category_set)
        # find_posterior returns obj with category_list, a (category, score)
        # sequence ordered by score.
        obj = bayes_obj.find_posterior("%s %s" % (news.name, news.description))
        # Materialize a real list: the original passed a map() object,
        # which is lazy on Python 3 (it was only a list on Python 2).
        news.update(category_list=[category for category, _ in obj.category_list])
        for category, score in obj.category_list:
            category_dict[category] += 1

    # Persist the aggregated counts.
    category_obj = CollectionMapping('news_category')
    category_obj.save(type="education", category=category_dict)