Example #1
File: main.py Project: bcgov/epr
def get_db_session():
    """Yield a database session and close it once the caller is done."""
    # The original returned the session from inside `try`, so the `finally`
    # closed it before the caller ever used it. Yielding instead (the usual
    # FastAPI dependency pattern) keeps the session open for the request.
    db_session = SESSION()
    try:
        yield db_session
    finally:
        db_session.close()
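A minimal sketch of how this dependency might be wired up, assuming SESSION is a SQLAlchemy sessionmaker; the engine URL and names below are illustrative, not taken from the project:

# Hypothetical wiring for get_db_session(); names and URL are assumptions.
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

engine = create_engine("sqlite:///example.db")  # assumed connection string
SESSION = sessionmaker(bind=engine)

# FastAPI runs the generator up to `yield` for each request, hands the open
# session to the endpoint, and resumes it afterwards so the session closes.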
Example #2
def save_record(SESSION, source_html, url, parent_id=None, visited=0):
    r = Recomendation()
    r.parent_id = parent_id
    r.url = url
    # url.split('/')[4] extracts the profile slug from the URL path.
    file = './pages/' + slugify(url.split('/')[4]) + '.html'
    r.file = file
    r.visited = visited
    #r.page_source = source_html
    if save_html(source_html, file):
        SESSION.add(r)
        SESSION.commit()
    return r
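A hedged usage sketch for save_record, assuming a configured SQLAlchemy session and python-slugify's slugify (both implied by the code but not shown in this listing). Note that url.split('/')[4] presumes a URL shaped like https://www.linkedin.com/in/<profile>/, where index 4 is the profile slug:

# Illustrative call; the URL and HTML below are placeholders.
record = save_record(
    SESSION,
    source_html="<html>...</html>",
    url="https://www.linkedin.com/in/some-profile/",
    visited=1,
)
print(record.file)  # ./pages/some-profile.html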
Example #3
def _fire_api_request(items, site):
    item_ids = [x.id for x in items.all()]
    previous_max_id = SESSION.query(func.max(BodyfetcherMaxId.max_id))\
                             .filter(BodyfetcherMaxId.site_url == site).all()[0][0]
    intermediate_ids = list(range(previous_max_id + 1, min(item_ids)))
    fetch_ids = intermediate_ids + item_ids
    # The SE API accepts at most 100 IDs per call; keep only the newest ones.
    fetch_ids = fetch_ids[-min(len(fetch_ids), 100):]

    api_key = secrets['se_key']
    # The IDs are ints, so they must be stringified before joining.
    uri = 'https://api.stackexchange.com/2.2/questions/{}?site={}&key={}'.format(
        ';'.join(str(i) for i in fetch_ids), site, api_key)

    API_REQUEST_LOCK.acquire()
    try:
        if datetime.utcnow() < BACKOFF_UNTIL:
            sleep((BACKOFF_UNTIL - datetime.utcnow()).total_seconds())

        try:
            response = requests.get(uri, timeout=10)
        except (requests.exceptions.Timeout, requests.ConnectionError) as ex:
            tell_rooms_with('debug', "SE API request errored: {}".format(ex))
            return
    finally:
        # Release even on the early return above; the original leaked the
        # lock whenever the request raised.
        API_REQUEST_LOCK.release()

    response = response.json()

    _clear_queue(fetch_ids, site)
    _process_response(response)
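The slice that trims fetch_ids keeps only the newest 100 IDs, matching the Stack Exchange API's cap of 100 IDs per /questions call. A tiny standalone demo of that windowing:

# Standalone demo of the ID windowing used above.
fetch_ids = list(range(1, 251))            # 250 candidate IDs
window = fetch_ids[-min(len(fetch_ids), 100):]
assert window == list(range(151, 251))     # only the newest 100 survive
assert len(window) == 100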
Example #4
def get_last_messages(room, count):
    messages = SESSION.query(SmokeyMessage.message_id).filter(
        SmokeyMessage.chat_site_url == room._client.host,
        SmokeyMessage.room_id == room.id)

    for msg_id, in messages.order_by(
            SmokeyMessage.message_id.desc()).limit(count):
        yield room._client.get_message(msg_id)
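Because get_last_messages is a generator, callers can consume it lazily; a hedged usage sketch, assuming a ChatExchange-style room object (which the room._client and get_message calls suggest):

# Illustrative: print the ten most recent recorded messages in a room.
# The .content attribute is an assumption about the message object.
for message in get_last_messages(room, count=10):
    print(message.content)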
Example #5
    def process_url(url_seed, parent_id):

        exists = SESSION.query(Recomendation).filter_by(url=url_seed).first()

        if not exists or exists.visited == 0:

            print('Fetching seed profile:', url_seed)
            page_source = visit_page(url_seed)

            r = save_record(SESSION,
                            page_source,
                            url_seed,
                            parent_id=parent_id,
                            visited=1)

            soup = BeautifulSoup(page_source, 'html.parser')
            ul = soup.find(
                'ul',
                class_=
                'pv-profile-section__section-info section-info browsemap mt4')

            links = []

            # soup.find returns None when the browsemap section is absent,
            # which would crash the find_all below; guard against it.
            if ul is not None:
                for a in ul.find_all(
                        'a', class_='pv-browsemap-section__member ember-view'):
                    url = domain + a['href']
                    if len(url) < 255:  # presumably the width of the url column
                        links.append(url)

            visitados = []

            sources = Parallel(n_jobs=-1)(delayed(visit_page)(link)
                                          for link in links)

            #print('############',len(sources))

            for page_source, link in zip(sources, links):
                rr = save_record(SESSION, page_source, link, parent_id=r.id)
                visitados.append(rr)

            for v in visitados:
                v.visited = 1  # the mapped attribute is `visited`; `visitado` never reached the DB
                SESSION.commit()
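The Parallel(n_jobs=-1)(delayed(...) ...) idiom above fans visit_page out across all CPU cores via joblib. A tiny self-contained demo of the same pattern:

# Minimal joblib demo mirroring the fan-out above.
from joblib import Parallel, delayed

def square(n):
    return n * n

# n_jobs=-1 uses every available core; results come back in input order.
results = Parallel(n_jobs=-1)(delayed(square)(i) for i in range(5))
assert results == [0, 1, 4, 9, 16]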
Example #6
def unnotify(msg, room_id, site) -> str:
    chat_host = msg._client.host
    user_id = msg.owner.id

    notifications = SESSION.query(Notification).filter(
        Notification.chat_site_url == chat_host,
        Notification.chat_user_id == user_id, Notification.room_id == room_id,
        Notification.site_url == site)
    BaseModel.delete_collection(notifications)

    return "You will no longer be notified of reports on `{}`, in room {} on chat.{}.".format(
        site, room_id, chat_host)
Example #7
def _clear_queue(ids, site):
    col = SESSION.query(BodyfetcherQueueItem).filter(BodyfetcherQueueItem.post_id.in_(ids),
                                                     BodyfetcherQueueItem.site_url == site)
    BaseModel.delete_collection(col)
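BaseModel.delete_collection is not shown in this listing; a plausible minimal implementation, offered purely as a sketch (the real helper may well differ):

# Hypothetical sketch of the missing helper.
class BaseModel:
    @staticmethod
    def delete_collection(query):
        # Bulk-delete every row matched by the query without loading
        # the objects into the session first.
        query.delete(synchronize_session=False)
        SESSION.commit()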
Example #8
File: main.py Project: bcgov/epr
async def health(db_session: SESSION = Depends(get_db_session)):
    """ Return 200 if site is up and healthy. At this point, we assume that being able to
    talk to the database means we're good to go. """
    db_session.execute('SELECT 1')
    return Health(ok=db_session is not None)
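A hedged smoke test for the endpoint, assuming a FastAPI app object that registered this handler at /health (the path and the `app` name are assumptions; only the handler appears above):

# Illustrative test; `app` and the /health path are assumptions.
from fastapi.testclient import TestClient

client = TestClient(app)
response = client.get("/health")
assert response.status_code == 200
assert response.json() == {"ok": True}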
Example #9
def crawler():
    def save_html(html_str, filename):
        with open(filename, "w") as html_file:
            html_file.write(html_str)
        return True

    def save_record(SESSION, source_html, url, parent_id=None, visited=0):
        r = Recomendation()
        r.parent_id = parent_id
        r.url = url
        file = './pages/' + slugify(url.split('/')[4]) + '.html'
        r.file = file
        r.visited = visited
        #r.page_source = source_html
        if save_html(source_html, file):
            SESSION.add(r)
            SESSION.commit()
        return r

    def visit_page(url):
        print('Fetching profile:', url)
        BROWSER = init_browser()

        BROWSER.get(url)

        with open(cookie_file, "rb") as f:
            cookies = pickle.load(f)
        for cookie in cookies:
            BROWSER.add_cookie(cookie)

        BROWSER.refresh()

        BROWSER.get(url)

        WebDriverWait(BROWSER, timeout).until(
            EC.visibility_of_element_located(
                (By.CLASS_NAME, 'pv-deferred-area__content')))

        page_source = BROWSER.page_source
        BROWSER.quit()

        return page_source

    def process_url(url_seed, parent_id):

        exists = SESSION.query(Recomendation).filter_by(url=url_seed).first()

        if not exists or exists.visited == 0:

            print('Fetching seed profile:', url_seed)
            page_source = visit_page(url_seed)

            r = save_record(SESSION,
                            page_source,
                            url_seed,
                            parent_id=parent_id,
                            visited=1)

            soup = BeautifulSoup(page_source, 'html.parser')
            ul = soup.find(
                'ul',
                class_=
                'pv-profile-section__section-info section-info browsemap mt4')

            links = []

            # soup.find returns None when the browsemap section is absent,
            # which would crash the find_all below; guard against it.
            if ul is not None:
                for a in ul.find_all(
                        'a', class_='pv-browsemap-section__member ember-view'):
                    url = domain + a['href']
                    if len(url) < 255:  # presumably the width of the url column
                        links.append(url)

            visitados = []

            sources = Parallel(n_jobs=-1)(delayed(visit_page)(link)
                                          for link in links)

            #print('############',len(sources))

            for page_source, link in zip(sources, links):
                rr = save_record(SESSION, page_source, link, parent_id=r.id)
                visitados.append(rr)

            for v in visitados:
                v.visited = 1  # the mapped attribute is `visited`; `visitado` never reached the DB
                SESSION.commit()

    try:

        BROWSER = init_browser()
        # Login if cookie is not present
        if not os.path.isfile(cookie_file):
            print('--- Login ---')
            # Requests login page
            BROWSER.get(crawl_urls['login'])

            # Wait until the element is present in the DOM
            WebDriverWait(BROWSER, timeout).until(
                EC.visibility_of_element_located((By.ID, 'login-submit')))

            # Authenticate by filling in the email and password fields and
            # clicking the login button. find_element_by_id was removed in
            # Selenium 4; find_element(By.ID, ...) works in both 3.x and 4.x.
            BROWSER.find_element(By.ID, 'login-email').send_keys(
                credentials['email'])
            BROWSER.find_element(By.ID, 'login-password').send_keys(
                credentials['password'])
            BROWSER.find_element(By.ID, 'login-submit').click()

            # Wait till the 'core-rail' class is located
            WebDriverWait(BROWSER, timeout).until(
                EC.visibility_of_element_located((By.CLASS_NAME, 'core-rail')))

            # Save the cookies
            with open(cookie_file, "wb") as f:
                pickle.dump(BROWSER.get_cookies(), f)

            # Taking screenshot
            BROWSER.save_screenshot(screenshot_dir + 'homepage_from_auth.png')
        else:  # restore SESSION from cookie
            print('--- From cookie ---')
            BROWSER.get(crawl_urls['home'])

            # Load the cookies and refresh the page
            with open(cookie_file, "rb") as f:
                cookies = pickle.load(f)
            for cookie in cookies:
                BROWSER.add_cookie(cookie)

            BROWSER.refresh()

            # Wait till the element is located
            WebDriverWait(BROWSER, timeout).until(
                EC.visibility_of_element_located((By.CLASS_NAME, 'core-rail')))

        process_url(SEED, None)

        # Repeated passes over the crawl queue; stop early once every
        # recommendation has been visited instead of always looping 100 times.
        for _ in range(100):

            not_visited = SESSION.query(Recomendation).filter_by(
                visited=0).all()
            if not not_visited:
                break

            for v in not_visited:
                process_url(v.url, v.id)

        BROWSER.quit()

    except TimeoutException:
        print('Timed out waiting for page to load')
        # Taking screenshot
        BROWSER.save_screenshot(screenshot_dir + 'timeout_exception.png')
        BROWSER.quit()
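The cookie handling in crawler follows the standard Selenium round trip: a domain must already be loaded before add_cookie will accept cookies for it, which is why the code calls get() first, adds the cookies, then refresh(). A trimmed, hedged sketch of just that round trip (BROWSER and the file name below are illustrative):

import pickle

# Save the authenticated session's cookies once...
with open("linkedin_cookies.pkl", "wb") as f:
    pickle.dump(BROWSER.get_cookies(), f)

# ...and restore them in a later run. Selenium only accepts cookies for
# the domain currently loaded, hence the get() before add_cookie().
BROWSER.get("https://www.linkedin.com")
with open("linkedin_cookies.pkl", "rb") as f:
    for cookie in pickle.load(f):
        BROWSER.add_cookie(cookie)
BROWSER.refresh()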