# -*- coding: utf-8 -*- from goose import Goose import pymongo from bs4 import BeautifulSoup import requests import datetime import zlib import cPickle as CP import cld from requests.exceptions import ConnectionError, Timeout import bson import settings import logging_mc logger = logging_mc.get_logger('valor') client = pymongo.MongoClient(settings.MONGOHOST, 27017) MCDB = client.MCDB ARTICLES = MCDB.articles # Article Collection ARTICLES.ensure_index("source") def find_articles(): """ Get the urls of last news :return: last news' urls of all categories :rtype: set() """ urls = ['http://www.valor.com.br/ultimas-noticias/brasil', 'http://www.valor.com.br/ultimas-noticias/politica', 'http://www.valor.com.br/ultimas-noticias/financas', 'http://www.valor.com.br/ultimas-noticias/empresas',
# -*- coding: utf-8 -*- from goose import Goose import pymongo from bs4 import BeautifulSoup import requests import datetime import zlib import cPickle as CP import cld from requests.exceptions import ConnectionError, Timeout import bson import settings import logging_mc import re logger = logging_mc.get_logger('ZH') client = pymongo.MongoClient(settings.MONGOHOST, 27017) MCDB = client.MCDB ARTICLES = MCDB.articles # Article Collection ARTICLES.ensure_index("source") def find_articles(): """ Get the urls of last news :return: last news' urls of all categories :rtype: set() """ urls = ['http://zh.clicrbs.com.br/rs/noticias/ultimas-noticias/', 'http://zh.clicrbs.com.br/rs/entretenimento/ultimas-noticias/', 'http://zh.clicrbs.com.br/rs/esportes/ultimas-noticias/',