def create_requester():
    """Build and return a freshly configured ``requester`` instance.

    The instance is given a ``cookies.txt`` path located in the same
    directory as this module. Caching and debug output are switched on
    when the ``REQUESTER_CACHING`` / ``REQUESTER_DEBUG`` environment
    variables are present (their values are not inspected), and
    ``time_out`` is set to 30.
    """
    module_dir = os.path.dirname(__file__)
    cookie_path = os.path.join(module_dir, 'cookies.txt')
    env = os.environ

    client = requester(cookie_filename=cookie_path)
    # Attribute assignments are kept in their original order.
    client.caching = 'REQUESTER_CACHING' in env
    client.time_out = 30
    client.debug_output = 'REQUESTER_DEBUG' in env
    return client
def get_external_resource(resource_url):
    """
    Fetch ``resource_url`` through ``RequesterModule.requester``.

    @param {string} resource_url: address passed unchanged to the requester
    @return {string} the fetched response when
        ``RequesterModule.is_http_response_valid`` accepts it,
        otherwise the empty string
    """
    fetched = RequesterModule.requester(resource_url)
    return fetched if RequesterModule.is_http_response_valid(fetched) else ''
def _get_data_external_links(scripts, driver=None): """ @param scripts: a list of HTML internal scripts and exernal script links (src) @returns: an ordered list containing inline scripts and the contents of the REACHABLE external script links """ data = [] if driver is None: # use python requests for item in scripts: script_type = item[0] if script_type == "external_script": link = item[1] d = RequesterModule.requester(link) if RequesterModule.is_http_response_valid(d): d_str = str(d).strip() if (not d_str.startswith("""<!doctype html>""")) and ( 'doctype html' not in d_str ): #ignore the case when resource is HTML, e.g, non-authenticated access via python requests data.append([script_type, d]) else: ## no valid content print("+ InvalidResourceURL encountered!") continue else: data.append(item) return data else: # use browser for item in scripts: script_type = item[0] if script_type == "external_script": link = item[1] current_handle = driver.current_window_handle driver.execute_script( """window.open('', '_blank')""") # new tab time.sleep(1) driver.switch_to_window(driver.window_handles[1]) driver.get(link) time.sleep(1) d = driver.page_source driver.close() # closes the new tab driver.switch_to_window(current_handle) dp = BeautifulSoup(d, 'html.parser') d_str = dp.find( 'pre', recursive=True) # js is rendered in a pre tag in chrome if d_str is None: continue else: d_str = d_str.text # get the 'pre' tag content if ( not d_str.startswith("""<!doctype html>""") ): #ignore the case when resource is HTML, e.g, non-authenticated access via python requests data.append([script_type, d_str]) else: ## no valid content print("+ InvalidResourceURL encountered!") continue else: data.append(item) return data
#!/usr/bin/env python # -*- coding: utf-8 -*- import os from copy import deepcopy from datetime import timedelta from abc import ABCMeta, abstractmethod import logging from utils.requester import requester from utils.requester import FailOnGetResponse # noqa REQ = requester( cookie_filename=os.path.join(os.path.dirname(__file__), 'cookies.txt')) REQ.caching = 'REQUESTER_CACHING' in os.environ REQ.time_out = 23 REQ.debug_output = 'REQUESTER_DEBUG' in os.environ SPACE = ' ' DOT = '.' ch = logging.StreamHandler() ch.setFormatter( logging.Formatter( '%(asctime)s - %(levelname)-10s - %(name)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S')) ch.setLevel(logging.DEBUG) LOG = logging.getLogger('ranking.modules') LOG.setLevel(logging.INFO) LOG.addHandler(ch)