Exemplo n.º 1
0
def scrap(url):
    """Fetch *url* and return its HTML parsed as a BeautifulSoup tree.

    On any non-200 response, record a 'crawlerError' entry through the
    DAO log and terminate the process with exit status 1.
    """
    response = requests.get(url)
    if response.status_code != 200:
        # Log the failed status + URL, then abort the whole run.
        dao = DAO()
        detail = str(response.status_code) + ' ' + url
        dao.writeLog('crawlerError', detail)
        print('registered connection error!', detail)
        sys.exit(1)
    return BeautifulSoup(response.text, "html.parser")
Exemplo n.º 2
0
def periodically_delete_logs():
    """Purge log documents older than seven days.

    Uses a single bulk ``delete_many`` call instead of the original
    pattern of fetching each stale document and issuing a separate
    ``delete_one`` per result (one round-trip instead of N+1).
    """
    dao = DAO()
    # NOTE(review): today() is naive local time — the stored 'date'
    # field presumably uses the same convention; confirm before
    # switching to timezone-aware UTC.
    week_ago = datetime.datetime.today() - datetime.timedelta(days=7)
    dao.db.log.delete_many({'date': {'$lt': week_ago}})
Exemplo n.º 3
0
class French():
    """Scrapes French-language quotes from fr.wikiquote.org and stores
    them through the DAO.

    NOTE(review): ``filter`` and ``format`` used below are project
    helpers defined elsewhere in this module — their call signatures
    (single-argument filter, six-argument format) would be rejected by
    the builtins of the same name.
    """

    # Base URL; the author's page name is appended in urlSetUp().
    url = 'https://fr.wikiquote.org/wiki/'

    def __init__(self):
        # Per-instance state (the original used a mutable class-level
        # ``data = []``, which would be shared across all instances).
        self.d = DAO()
        self.data = []

    def Fetch_Fr(self, soup, url_name):
        """Extract quotes and sources from *soup* and persist a new
        record for the author via the DAO."""
        data = self.d.getData()
        # Next sequential _id; start at 1 when the collection is empty.
        # (Renamed from ``id`` to avoid shadowing the builtin.)
        next_id = data[-1]['_id'] + 1 if data else 1
        quotes = [
            filter(i.text) for i in soup.findAll('div', {'class': 'citation'})
        ]
        source = [
            filter(i.text) for i in soup.findAll('div', {'class': 'ref'})
        ]
        author = soup.find('h1', {'id': 'firstHeading'}).text
        aux = [{'quotes': quotes, 'source': source}]
        flag = "🇫🇷"
        record = format('fr', author, aux, flag, url_name, next_id)
        self.d.save(record)

    def urlSetUp(self, author, language):
        """Scrape *author*'s page unless their quotes are already stored.

        Returns True when a scrape was performed, False when the author
        was already up to date (in which case only a status log entry is
        written).
        """
        if len(self.d.checkAuthor(author)) > 0:
            status = 'All quotes from ' + author + ' in ' + language + '  are up to date!'
            self.d.writeLog('status', status)
            print("Log registered!")
            return False
        else:
            print('****** formating url **** ')
            addr = self.url + author
            print(addr)
            self.Fetch_Fr(scrap(addr), author)
            return True
Exemplo n.º 4
0
 def __init__(self):
     """Initialize the instance's data-access object."""
     # NOTE(review): enclosing class is not visible in this snippet;
     # DAO is a project-defined data-access class.
     self.d = DAO()