def getNews(self): se = Searcher() field = se.getall() adj = self.getAdj() res = Resources() classifier = self.getClassifier(adj) stopwords = nltk.corpus.stopwords.words('portuguese') for a in field: #Obtem todas as noticias indexadas no woosh try: n_id = int(float(field[a]["id"])) n_title = field[a]["title"] n_link = field[a]["link"] n_content = field[a]["content"] try: namesread = codecs.open("libra/politics/management/commands/utils/resources/names_plainText.txt",'r','latin-1') entidades = res.encontraNomes(n_title + "" + n_content, ''); print entidades except UnicodeEncodeError: print "Erro a encontrar entidades" news = News.objects.create_Simple_News(n_id, n_title, n_link, n_content) Future(self.insereBD, entidades, news, adj, classifier, stopwords) except UnicodeEncodeError: print "Erro de encode aqui"
def single_news(request, n_id):
    """Django view: render one news item together with the people linked to
    it and the entities found in its text.

    Args:
        request: the HTTP request.
        n_id: primary key of the News row to display.
    Raises:
        News.DoesNotExist: if no News row has id == n_id.
    """
    template = 'entity_content.html'
    news = News.objects.get(id=n_id)
    lista_pessoas = list(news.entities.all())
    # BUG FIX: title and content were swapped (n_title was news.content and
    # n_content was news.title).
    n_title = news.title
    n_content = news.content
    path = os.path.join(
        SITE_ROOT,
        "politics/management/commands/utils/resources/names_plainText.txt")
    # BUG FIX: the file handle was never closed — leaked once per request.
    namesread = codecs.open(path, 'r', encoding='latin-1')
    try:
        res = Resources()
        entities = res.encontraNomes(n_title + "" + n_content, namesread)
    finally:
        namesread.close()
    c = Context({'news': news,
                 'lista_pessoas': lista_pessoas,
                 'entidades': entities})
    return render(request, template, c)
def parseFeeds(self): # print "esta aqui" # print Searcher().getcount() adj = self.getAdj() res = Resources() classifier = self.getClassifier(adj) hit_list = [ "http://feeds.jn.pt/JN-Politica", "http://feeds.dn.pt/DN-Politica", "http://economico.sapo.pt/rss/politica" ] # list of feeds to pull down future_calls = [feedparser.parse(rss_url) for rss_url in hit_list] # block until they are all in feeds = [x for x in future_calls] print feeds entries = [] for feed in feeds: entries.extend(feed[ "items" ]) for e in entries: n_title = e['title'] # print "vai adicionar", n_title n_link = e['link'] summary = re.split("<", e['summary']) n_content = summary[0] try: entidades = res.encontraNomes(n_title + "" + n_content, ''); except UnicodeEncodeError: print "Erro a encontrar entidades" try: #indexer.clean_index("libra/politics/indexdir") thisid = indexer.index_my_news(n_title,n_link,n_content, "libra/politics/indexdir", False) n_id = thisid print "searcher ",Searcher().getcount(), thisid if thisid: news = News.objects.create_Simple_News(n_id, n_title, n_link, n_content) Future(self.insereBD,entidades, news, adj, classifier) except TypeError: print n_title