def get_db_session():
    """Get a database session to perform transactions."""
    db_session = SESSION()
    try:
        # Yield so the caller (e.g. a FastAPI dependency) can use the session
        # before it is closed; closing in `finally` guarantees cleanup.
        yield db_session
    finally:
        db_session.close()
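# Assumed setup (a sketch, not from the original source): SESSION is taken to be
# a SQLAlchemy scoped_session, which can be called like a factory (SESSION(), as
# above) and also used directly (SESSION.query(...), SESSION.commit(), as in the
# functions below). The database URL here is a placeholder.
from sqlalchemy import create_engine
from sqlalchemy.orm import scoped_session, sessionmaker

engine = create_engine('sqlite:///crawler.db')
SESSION = scoped_session(sessionmaker(bind=engine))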
def save_record(SESSION, source_html, url, parent_id=None, visited=0):
    r = Recomendation()
    r.parent_id = parent_id
    r.url = url
    # Derive the local file name from the profile slug in the URL path.
    file = './pages/' + slugify(url.split('/')[4]) + '.html'
    r.file = file
    r.visited = visited
    #r.page_source = source_html
    if save_html(source_html, file):
        SESSION.add(r)
        SESSION.commit()
    return r
def _fire_api_request(items, site):
    item_ids = [x.id for x in items.all()]
    previous_max_id = SESSION.query(func.max(BodyfetcherMaxId.max_id))\
        .filter(BodyfetcherMaxId.site_url == site).all()[0][0]
    # Also pick up any IDs posted between the last fetched maximum and the
    # smallest queued ID, then cap the request at the API limit of 100 IDs.
    intermediate_ids = list(range(previous_max_id + 1, min(item_ids)))
    fetch_ids = intermediate_ids + item_ids
    fetch_ids = fetch_ids[-min(len(fetch_ids), 100):]

    api_key = secrets['se_key']
    uri = 'https://api.stackexchange.com/2.2/questions/{}?site={}&key={}'.format(
        ';'.join(str(i) for i in fetch_ids), site, api_key)

    API_REQUEST_LOCK.acquire()
    try:
        if datetime.utcnow() < BACKOFF_UNTIL:
            sleep((BACKOFF_UNTIL - datetime.utcnow()).total_seconds())
        try:
            response = requests.get(uri, timeout=10)
        except (requests.exceptions.Timeout, requests.ConnectionError) as ex:
            tell_rooms_with('debug', "SE API request errored: {}".format(ex))
            return
    finally:
        # Release the lock on both the success and the error path.
        API_REQUEST_LOCK.release()

    response = response.json()
    _clear_queue(fetch_ids, site)
    _process_response(response)
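# Sketch (assumption, not the project's actual _process_response): the Stack
# Exchange API may include a `backoff` field asking clients to pause. This shows
# one way the BACKOFF_UNTIL global used above could be refreshed from a response;
# it requires `from datetime import datetime, timedelta`.
def _honour_backoff(response_json):
    global BACKOFF_UNTIL
    backoff_seconds = response_json.get('backoff')
    if backoff_seconds:
        BACKOFF_UNTIL = datetime.utcnow() + timedelta(seconds=backoff_seconds)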
def get_last_messages(room, count):
    messages = SESSION.query(SmokeyMessage.message_id).filter(
        SmokeyMessage.chat_site_url == room._client.host,
        SmokeyMessage.room_id == room.id)
    for msg_id, in messages.order_by(
            SmokeyMessage.message_id.desc()).limit(count):
        yield room._client.get_message(msg_id)
def process_url(url_seed, parent_id):
    exists = SESSION.query(Recomendation).filter_by(url=url_seed).first()
    if not exists or exists.visited == 0:
        print('Fetching seed profile:', url_seed)
        page_source = visit_page(url_seed)
        r = save_record(SESSION, page_source, url_seed,
                        parent_id=parent_id, visited=1)
        soup = BeautifulSoup(page_source, 'html.parser')
        # Related profiles listed in the page's browsemap section.
        ul = soup.find(
            'ul',
            class_='pv-profile-section__section-info section-info browsemap mt4')
        links = []
        for a in ul.find_all(
                'a', class_='pv-browsemap-section__member ember-view'):
            url = domain + a['href']
            if len(url) < 255:
                links.append(url)
        visitados = []
        # Fetch the linked profiles in parallel.
        sources = Parallel(n_jobs=-1)(delayed(visit_page)(link) for link in links)
        #print('############', len(sources))
        for page_source, link in zip(sources, links):
            rr = save_record(SESSION, page_source, link, parent_id=r.id)
            visitados.append(rr)
        for v in visitados:
            v.visited = 1
        SESSION.commit()
def unnotify(msg, room_id, site) -> str:
    chat_host = msg._client.host
    user_id = msg.owner.id
    notifications = SESSION.query(Notification).filter(
        Notification.chat_site_url == chat_host,
        Notification.chat_user_id == user_id,
        Notification.room_id == room_id,
        Notification.site_url == site)
    BaseModel.delete_collection(notifications)
    return "You will no longer be notified of reports on `{}`, in room {} on chat.{}.".format(
        site, room_id, chat_host)
def _clear_queue(ids, site):
    col = SESSION.query(BodyfetcherQueueItem).filter(
        BodyfetcherQueueItem.post_id.in_(ids),
        BodyfetcherQueueItem.site_url == site)
    BaseModel.delete_collection(col)
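# Sketch (assumption, not the project's actual helper): BaseModel.delete_collection
# is used above and in unnotify(); a minimal implementation consistent with those
# call sites would delete every row matched by the query and commit once.
def _delete_collection_sketch(query):
    for row in query:
        SESSION.delete(row)
    SESSION.commit()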
async def health(db_session: SESSION = Depends(get_db_session)):
    """
    Return 200 if the site is up and healthy.

    At this point, we assume that being able to talk to the database means
    we're good to go.
    """
    db_session.execute('SELECT 1')
    return Health(ok=db_session is not None)
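# Sketch (assumption, not from the original source): `Health` is presumably a
# small response model along these lines, used as the return value of the
# endpoint above.
from pydantic import BaseModel as PydanticBaseModel

class Health(PydanticBaseModel):
    ok: bool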
def crawler():
    def save_html(html_str, filename):
        with open(filename, "w") as html_file:
            html_file.write(html_str)
        return True

    def save_record(SESSION, source_html, url, parent_id=None, visited=0):
        r = Recomendation()
        r.parent_id = parent_id
        r.url = url
        # Derive the local file name from the profile slug in the URL path.
        file = './pages/' + slugify(url.split('/')[4]) + '.html'
        r.file = file
        r.visited = visited
        #r.page_source = source_html
        if save_html(source_html, file):
            SESSION.add(r)
            SESSION.commit()
        return r

    def visit_page(url):
        print('Fetching profile:', url)
        BROWSER = init_browser()
        # Load the page once to set the domain, restore the saved cookies,
        # then reload so the authenticated page is rendered.
        BROWSER.get(url)
        cookies = pickle.load(open(cookie_file, "rb"))
        for cookie in cookies:
            BROWSER.add_cookie(cookie)
        BROWSER.refresh()
        BROWSER.get(url)
        WebDriverWait(BROWSER, timeout).until(
            EC.visibility_of_element_located(
                (By.CLASS_NAME, 'pv-deferred-area__content')))
        page_source = BROWSER.page_source
        BROWSER.quit()
        return page_source

    def process_url(url_seed, parent_id):
        exists = SESSION.query(Recomendation).filter_by(url=url_seed).first()
        if not exists or exists.visited == 0:
            print('Fetching seed profile:', url_seed)
            page_source = visit_page(url_seed)
            r = save_record(SESSION, page_source, url_seed,
                            parent_id=parent_id, visited=1)
            soup = BeautifulSoup(page_source, 'html.parser')
            # Related profiles listed in the page's browsemap section.
            ul = soup.find(
                'ul',
                class_='pv-profile-section__section-info section-info browsemap mt4')
            links = []
            for a in ul.find_all(
                    'a', class_='pv-browsemap-section__member ember-view'):
                url = domain + a['href']
                if len(url) < 255:
                    links.append(url)
            visitados = []
            # Fetch the linked profiles in parallel.
            sources = Parallel(n_jobs=-1)(delayed(visit_page)(link) for link in links)
            #print('############', len(sources))
            for page_source, link in zip(sources, links):
                rr = save_record(SESSION, page_source, link, parent_id=r.id)
                visitados.append(rr)
            for v in visitados:
                v.visited = 1
            SESSION.commit()

    try:
        BROWSER = init_browser()
        # Log in if no cookie file is present.
        if not os.path.isfile(cookie_file):
            print('--- Login ---')
            # Request the login page.
            BROWSER.get(crawl_urls['login'])
            # Wait until the element is present in the DOM.
            WebDriverWait(BROWSER, timeout).until(
                EC.visibility_of_element_located((By.ID, 'login-submit')))
            # Authenticate by filling in the email and password fields and
            # clicking the login button.
            BROWSER.find_element_by_id('login-email').send_keys(
                credentials['email'])
            BROWSER.find_element_by_id('login-password').send_keys(
                credentials['password'])
            BROWSER.find_element_by_id('login-submit').click()
            # Wait until the 'core-rail' class is located.
            WebDriverWait(BROWSER, timeout).until(
                EC.visibility_of_element_located((By.CLASS_NAME, 'core-rail')))
            # Save the cookies.
            pickle.dump(BROWSER.get_cookies(), open(cookie_file, "wb"))
            # Take a screenshot.
            BROWSER.save_screenshot(screenshot_dir + 'homepage_from_auth.png')
        else:
            # Restore the session from the saved cookies.
            print('--- From cookie ---')
            BROWSER.get(crawl_urls['home'])
            # Load the cookies and refresh the page.
            cookies = pickle.load(open(cookie_file, "rb"))
            for cookie in cookies:
                BROWSER.add_cookie(cookie)
            BROWSER.refresh()
            # Wait until the element is located.
            WebDriverWait(BROWSER, timeout).until(
                EC.visibility_of_element_located((By.CLASS_NAME, 'core-rail')))

        # Crawl the seed profile, then keep processing unvisited records.
        process_url(SEED, None)
        for i in range(100):
            not_visited = SESSION.query(Recomendation).filter_by(
                visited=0).all()
            for v in not_visited:
                process_url(v.url, v.id)
        BROWSER.quit()
    except TimeoutException:
        print('Timed out waiting for page to load')
        # Take a screenshot.
        BROWSER.save_screenshot(screenshot_dir + 'timeout_exception.png')
        BROWSER.quit()
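# Sketch (assumption, not from the original source): the crawler relies on an
# init_browser() helper plus module-level settings (cookie_file, timeout,
# crawl_urls, credentials, screenshot_dir, domain, SEED). A minimal headless
# Chrome version of the helper might look like this; the options are placeholders.
from selenium import webdriver

def init_browser():
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--window-size=1366,768')
    return webdriver.Chrome(options=options)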