Пример #1
0
def create_requester():
    """Build and return a ``requester`` configured from the environment.

    The cookie file is ``cookies.txt`` located in the same directory as this
    module. Caching and debug output are switched on merely by the presence
    of the ``REQUESTER_CACHING`` / ``REQUESTER_DEBUG`` environment variables
    (their values are not inspected). ``time_out`` is set to 30 (presumably
    seconds -- confirm against the ``requester`` implementation).
    """
    cookies_path = os.path.join(os.path.dirname(__file__), 'cookies.txt')
    env = os.environ

    client = requester(cookie_filename=cookies_path)
    client.caching = 'REQUESTER_CACHING' in env
    client.time_out = 30
    client.debug_output = 'REQUESTER_DEBUG' in env
    return client
Пример #2
0
def get_external_resource(resource_url):
    """
    Fetch ``resource_url`` through ``RequesterModule.requester``.

    @param {string} resource_url
    @return {string} the response when ``RequesterModule.is_http_response_valid``
            accepts it, otherwise an empty string
    """
    fetched = RequesterModule.requester(resource_url)
    return fetched if RequesterModule.is_http_response_valid(fetched) else ''
Пример #3
0
def _get_data_external_links(scripts, driver=None):
    """
	@param scripts: a list of HTML internal scripts and exernal script links (src)
	@returns: an ordered list containing inline scripts and 
			  the contents of the REACHABLE external script links
	"""
    data = []
    if driver is None:
        # use python requests
        for item in scripts:
            script_type = item[0]
            if script_type == "external_script":
                link = item[1]
                d = RequesterModule.requester(link)
                if RequesterModule.is_http_response_valid(d):
                    d_str = str(d).strip()
                    if (not d_str.startswith("""<!doctype html>""")) and (
                            'doctype html' not in d_str
                    ):  #ignore the case when resource is HTML, e.g, non-authenticated access via python requests
                        data.append([script_type, d])
                else:
                    ## no valid content
                    print("+ InvalidResourceURL encountered!")
                    continue
            else:
                data.append(item)
        return data
    else:
        # use browser
        for item in scripts:
            script_type = item[0]
            if script_type == "external_script":
                link = item[1]
                current_handle = driver.current_window_handle
                driver.execute_script(
                    """window.open('', '_blank')""")  # new tab
                time.sleep(1)
                driver.switch_to_window(driver.window_handles[1])
                driver.get(link)
                time.sleep(1)
                d = driver.page_source
                driver.close()  # closes the new tab
                driver.switch_to_window(current_handle)

                dp = BeautifulSoup(d, 'html.parser')
                d_str = dp.find(
                    'pre',
                    recursive=True)  # js is rendered in a pre tag in chrome
                if d_str is None:
                    continue
                else:
                    d_str = d_str.text  # get the 'pre' tag content

                if (
                        not d_str.startswith("""<!doctype html>""")
                ):  #ignore the case when resource is HTML, e.g, non-authenticated access via python requests
                    data.append([script_type, d_str])
                else:
                    ## no valid content
                    print("+ InvalidResourceURL encountered!")
                    continue
            else:
                data.append(item)
        return data
Пример #4
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os
from copy import deepcopy
from datetime import timedelta
from abc import ABCMeta, abstractmethod
import logging

from utils.requester import requester
from utils.requester import FailOnGetResponse  # noqa

# Shared HTTP client for this module; its cookie file is ``cookies.txt``
# in the same directory as this file.
REQ = requester(
    cookie_filename=os.path.join(os.path.dirname(__file__), 'cookies.txt'),
)
# Both switches depend only on whether the environment variable exists,
# not on its value.
REQ.caching = os.environ.get('REQUESTER_CACHING') is not None
REQ.time_out = 23
REQ.debug_output = os.environ.get('REQUESTER_DEBUG') is not None

SPACE, DOT = ' ', '.'

# Stream handler that lets DEBUG and above through and stamps every record
# with time, padded level name and logger name.
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
ch.setFormatter(
    logging.Formatter(
        fmt='%(asctime)s - %(levelname)-10s - %(name)s - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
    )
)

# The logger itself only emits INFO and above, so the handler's DEBUG
# threshold is effectively a lower bound that the logger level overrides.
LOG = logging.getLogger('ranking.modules')
LOG.addHandler(ch)
LOG.setLevel(logging.INFO)