def scrap(url, timeout=10):
    """Fetch *url* and return its parsed HTML tree.

    Parameters:
        url: Full address of the page to download.
        timeout: Seconds to wait for the HTTP response (new keyword,
            default 10, so an unresponsive host cannot hang the crawler
            forever — the original call had no timeout at all).

    Returns:
        A ``BeautifulSoup`` object built from the response body when the
        server answers with HTTP 200.

    Side effects:
        On any non-200 status the error is logged through ``DAO.writeLog``
        and the whole process exits with status 1 — original behavior kept
        so existing callers and cron jobs see the same failure mode.
    """
    r = requests.get(url, timeout=timeout)
    if r.status_code == 200:
        return BeautifulSoup(r.text, "html.parser")
    # Non-200 response: record the failure for the crawler log, then abort.
    d = DAO()
    d.writeLog('crawlerError', str(r.status_code) + ' ' + url)
    print('registered connection error!', str(r.status_code) + ' ' + url)
    sys.exit(1)
def periodically_delete_logs():
    """Purge log documents older than one week.

    Replaces the original per-document loop (a ``find`` followed by one
    ``delete_one`` per result — N+1 server round-trips) with a single
    ``delete_many`` using the same ``{'date': {'$lt': week_ago}}`` filter,
    which is one round-trip and semantically identical.

    NOTE(review): ``datetime.today()`` is timezone-naive — assumes the
    ``date`` field was stored naive as well; confirm against the writer.
    """
    dao = DAO()
    week_ago = datetime.datetime.today() - datetime.timedelta(days=7)
    dao.db.log.delete_many({'date': {'$lt': week_ago}})
class French:
    """Scraper for French Wikiquote author pages.

    Fetches an author's page, extracts quotes and their sources, and
    persists them through the shared ``DAO``.

    Changes from the original: the class-level ``data = []`` (a mutable
    attribute shared by every instance, and never actually read — all
    uses of ``data`` were locals) and the redundant ``d = None`` (always
    overwritten in ``__init__``) were removed; the local ``id`` was
    renamed so it no longer shadows the builtin.
    """

    # Base address; the author's page name is appended in urlSetUp.
    url = 'https://fr.wikiquote.org/wiki/'

    def __init__(self):
        # Data-access object used for reads, writes and logging.
        self.d = DAO()

    def Fetch_Fr(self, soup, url_name):
        """Extract quotes/sources from *soup* and save one record.

        Parameters:
            soup: Parsed page (as returned by ``scrap``).
            url_name: Page name used to build the URL; stored with the record.

        NOTE(review): ``filter`` and ``format`` are called with non-builtin
        signatures, so they must be project helpers imported elsewhere that
        shadow the builtins — consider renaming them at the definition site.
        """
        existing = self.d.getData()
        # Next sequential _id; start at 1 when the collection is empty.
        next_id = existing[-1]['_id'] + 1 if len(existing) > 0 else 1
        quotes = [filter(i.text) for i in soup.findAll('div', {'class': 'citation'})]
        source = [filter(i.text) for i in soup.findAll('div', {'class': 'ref'})]
        author = soup.find('h1', {'id': 'firstHeading'}).text
        aux = [{'quotes': quotes, 'source': source}]
        flag = "🇫🇷"
        record = format('fr', author, aux, flag, url_name, next_id)
        self.d.save(record)

    def urlSetUp(self, author, language):
        """Scrape quotes for *author* unless they are already stored.

        Returns:
            False when the author is already present (a status entry is
            logged instead); True after a successful fetch-and-save.
        """
        if len(self.d.checkAuthor(author)) > 0:
            status = 'All quotes from ' + author + ' in ' + language + ' are up to date!'
            self.d.writeLog('status', status)
            print("Log registered!")
            return False
        print('****** formating url **** ')
        addr = self.url + author
        print(addr)
        self.Fetch_Fr(scrap(addr), author)
        return True
def __init__(self):
    """Attach a fresh ``DAO`` instance to the object as ``d``.

    NOTE(review): this ``def`` sits at module level, duplicating the
    constructor inside ``French`` — presumably a stray paste; confirm
    whether it belongs to another class out of view.
    """
    self.d = DAO()