Example #1
def __init__(self, delay=1, headers=None, cookies=None, proxies=None, num_retries=5, cache=None):
    self.throttle = Throttle(delay)
    self.headers = headers
    self.cookies = cookies
    self.proxies = ProxiesPool()
    self.num_retries = num_retries
    self.cache = cache
Example #2
def __init__(self,
             delay=5,
             user_agent='wswp',
             proxies=None,
             num_retries=1,
             cache=None):
    self.throttle = Throttle(delay)
    self.user_agent = user_agent
    self.proxies = proxies
    self.num_retries = num_retries
    self.cache = cache
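
Examples #1 to #5 construct a Throttle(delay) and later call throttle.wait(url), but the class itself is never shown on this page. The sketch below is only an assumption about that interface, not the original authors' code: it keeps one timestamp per domain and sleeps when the same domain is requested again before the delay has elapsed.

import time
from urllib.parse import urlparse  # Python 3; Python 2 would use urlparse.urlparse


class Throttle(object):
    """Minimal per-domain download throttle (assumed interface)."""

    def __init__(self, delay):
        self.delay = delay    # minimum seconds between requests to the same domain
        self.domains = {}     # domain -> timestamp of the last request

    def wait(self, url):
        domain = urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (time.time() - last_accessed)
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = time.time()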
Example #3
import re
from queue import Queue  # Python 3; on Python 2: from Queue import Queue

# get_robots, download, get_links, normalize, same_domain and Throttle
# are assumed to be defined elsewhere.
def link_crawler(seed_url,
                 link_regex,
                 user_agent,
                 proxyD=None,
                 headers=None,
                 num_retries=1,
                 delay=2,
                 max_depth=-1,
                 max_urls=-1):
    rp = get_robots(seed_url)
    # crawl from seed_url, following links that match link_regex
    crawl_queue = Queue(0)
    crawl_queue.put(seed_url)
    # the URLs that have been seen and at what depth
    seen = {seed_url: 0}
    num_urls = 0

    throttle = Throttle(delay)
    headers = headers or {}
    if user_agent:
        headers['User-agent'] = user_agent

    while not crawl_queue.empty():
        url = crawl_queue.get()
        if rp.can_fetch(user_agent, url):
            throttle.wait(url)
            pagina = download(url, user_agent, proxyD, num_retries)
            links = []
            depth = seen[url]
            if depth != max_depth:
                links = get_links(pagina)
                # filter for links that match link_regex
                for link in links:
                    link = normalize(seed_url, link)
                    if link not in seen and re.match(link_regex, link):
                        seen[link] = depth + 1
                        if same_domain(seed_url, link):
                            # success! add this new link to queue
                            crawl_queue.put(link)
            # check whether the download limit has been reached
            num_urls += 1
            if num_urls == max_urls:
                break
        else:
            print('Blocked by robots.txt:', url)
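
A call to the link_crawler above might look like the following; the seed URL and regex are borrowed from Example #7 and are purely illustrative, and the helper functions it relies on (get_robots, download, get_links, normalize, same_domain) still have to be defined elsewhere.

link_crawler('http://example.webscraping.com/places/default/index',
             link_regex='/(index|view)',
             user_agent='wswp',
             delay=2,
             max_depth=2,
             max_urls=100)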
Example #4
# Python 2 example (urllib2 / print statement); Throttle is assumed to be
# defined elsewhere.
import random
import urllib2
import urlparse


class Downloader:
    def __init__(self,
                 delay=5,
                 user_agent='wswp',
                 proxies=None,
                 num_retries=1,
                 cache=None):
        self.throttle = Throttle(delay)
        self.user_agent = user_agent
        self.proxies = proxies
        self.num_retries = num_retries
        self.cache = cache

    def __call__(self, url):
        '''Executed when the instance is called directly.'''
        # throttle the download
        self.throttle.wait(url)
        # pick a random proxy if a proxy list was supplied
        proxy = random.choice(self.proxies) if self.proxies else None
        headers = {'User-agent': self.user_agent}
        return self.download(url, headers, proxy, self.num_retries)

    def download(self, url, headers, proxy, num_retries, data=None):
        print 'Downloading:', url
        request = urllib2.Request(url, data, headers)
        opener = urllib2.build_opener()
        if proxy:
            proxy_params = {urlparse.urlparse(url).scheme: proxy}
            opener.add_handler(urllib2.ProxyHandler(proxy_params))
        try:
            response = opener.open(request)
            html = response.read()
            code = response.code
        except urllib2.URLError as e:
            print 'Download error:', e.reason
            html = ''
            if hasattr(e, 'code'):
                code = e.code
                if num_retries > 0 and 500 <= code < 600:
                    # retry 5XX HTTP errors
                    return self.download(url, headers, proxy, num_retries - 1,
                                         data)
            else:
                code = None
        return {'code': code, 'html': html}
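
Because __call__ is defined, an instance of this Downloader can be invoked like a function. A hypothetical Python 2 usage, with an illustrative URL, could be:

downloader = Downloader(delay=5, user_agent='wswp', num_retries=2)
result = downloader('http://example.webscraping.com/places/default/index')
print result['code'], len(result['html'])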
Example #5
import requests

# Throttle and ProxiesPool are assumed to be defined elsewhere.
class Downloader(object):
    def __init__(self, delay=1, headers=None, cookies=None, proxies=None, num_retries=5, cache=None):
        self.throttle = Throttle(delay)
        self.headers = headers
        self.cookies = cookies
        # note: the proxies argument is ignored; a fresh ProxiesPool is always used
        self.proxies = ProxiesPool()
        self.num_retries = num_retries
        self.cache = cache

    def __call__(self, url):
        result = None
        if self.cache:
            try:
                # load cache
                result = self.cache[url]
                if result['code'] == 200:
                    print('Loading html from cache ' + url)
                else:
                    result = None
            except KeyError:
                pass

        if result is None:
            result = self.download(url, self.headers, self.cookies, self.proxies, self.num_retries)
            if self.cache:
                self.cache[url] = result  # the cache maps each URL to a dict {'html': html, 'code': code}
        return result['html']

    def download(self, url, headers, cookies, proxies, num_retries):
        print('Downloading ' + url)
        html = ''  # return an empty string on exceptions or repeated 5XX responses
        code = None
        proxies_to_pass = None
        proxy_ip = proxies.get_proxy() if proxies else None  # pick a random proxy IP
        if proxy_ip:
            proxies_to_pass = {'http': "http://{}".format(proxy_ip[0])}
        if num_retries > 0:
            try:
                self.throttle.wait(url)
                print(proxies_to_pass)
                r = requests.get(url, headers=headers, cookies=cookies, proxies=proxies_to_pass, timeout=30)
                code = r.status_code
                print(code)
                r.raise_for_status()
                if code == 200:
                    html = r.text
                    # print(html)
                elif code and 500 <= code < 600:
                    # retry 5XX HTTP errors
                    return self.download(url, headers, cookies, proxies, num_retries-1)
            except requests.exceptions.HTTPError:
                print('----Download error: ' + r.reason)
                return self.download(url, headers, cookies, proxies, num_retries-1)
            except requests.exceptions.Timeout:
                print('----Download error: timeout, retrying with another proxy.')
                return self.download(url, headers, cookies, proxies, num_retries-1)
            except Exception:
                print('----Unknown error: possibly unable to connect to the proxy.')
                return self.download(url, headers, cookies, proxies, num_retries-1)
        else:
            print('----Giving up: the download was retried too many times.')
        result = {'html': html, 'code': code}
        if self.cache:
            self.cache[url] = result
        # print(result)
        return result
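
Examples #1 and #5 ignore the proxies argument and build a ProxiesPool instead, but that class is not shown either. The sketch below only illustrates the interface the download() method above relies on, namely a get_proxy() returning an indexable object whose first element is an 'ip:port' string; the class body and any pool contents are hypothetical.

import random


class ProxiesPool(object):
    """Hypothetical proxy pool matching the interface assumed by Example #5."""

    def __init__(self, proxies=None):
        # each entry is an 'ip:port' string supplied by the caller
        self.proxies = list(proxies or [])

    def get_proxy(self):
        """Return a random (ip_port,) tuple, or None if the pool is empty."""
        if not self.proxies:
            return None
        return (random.choice(self.proxies),)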
Example #6
import configparser
import logging
import os
import sys
import threading
import time
from logging.handlers import TimedRotatingFileHandler
from pathlib import Path

import psutil

# Writer, Throttle, Requester, Scheduler, Crawler and DownloadWorker are
# project-specific classes assumed to be importable elsewhere.


def main():
    if len(sys.argv) > 1:
        config_path = sys.argv[1]
    else:
        config_path = './configs/config_default.txt'

    if not Path(config_path).is_file():
        logging.error("Could not find config file!")
        sys.exit(1)  # exiting with error code

    # load config
    config = configparser.ConfigParser()
    config.read(config_path)

    log_dir = config['PATHS']['log_dir']
    log_file_name = config['PATHS']['log_file_name']

    # check that the logging directory is present
    if not Path(log_dir).is_dir():
        logging.error("Logging directory is not present!")
        sys.exit(1)  # exiting with error code

    file_handler = TimedRotatingFileHandler(os.path.join(
        os.path.dirname(__file__), log_dir, log_file_name),
                                            when='midnight',
                                            interval=1)
    console_handler = logging.StreamHandler()
    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[file_handler, console_handler])

    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)

    logging.getLogger("requests").setLevel(logging.WARNING)
    logging.getLogger("urllib3").setLevel(logging.WARNING)
    logging.getLogger("apscheduler.scheduler").setLevel(logging.WARNING)
    logging.getLogger("apscheduler.executors.default").setLevel(
        logging.WARNING)
    logging.getLogger("chardet.charsetprober").setLevel(logging.WARNING)

    logger.info("=======Starting=Crawler=========")

    # store config preferences in variables
    article_download_pattern = ([
        (int(config['ARTICLE_DOWNLOAD_PATTERN']['number']),
         int(config['ARTICLE_DOWNLOAD_PATTERN']['delay'])),
    ])  # [(application number, period in seconds) ... ]
    number_download_worker = int(config['CRAWLING']['number_download_worker'])
    website_request_timeout = int(
        config['REQUESTS']['website_request_timeout'])
    rss_feed_crawl_period = int(config['CRAWLING']['rss_feed_crawl_period'])
    rss_feed_request_timeout = int(
        config['REQUESTS']['rss_feed_request_timeout'])
    warmup_iterations = int(config['CRAWLING']['warmup_iterations'])
    throttle_velocity = float(config['CRAWLING']['throttle_velocity'])
    max_offset = int(config['CRAWLING']['max_offset'])
    downloads_path = config['PATHS']['downloads']
    crawled_rss_articles_path = config['PATHS']['rss_articles']
    feed_path = config['PATHS']['feeds_list']
    requests_path = config['PATHS']['requests']

    # partly validating the config
    if not Path(feed_path).is_file():
        logging.error("Could not find RSS feeds list file!")
        sys.exit(1)  # exiting with error code

    parent_dir = os.path.dirname(requests_path)
    if not Path(parent_dir).is_dir():
        logging.error("Could not find requests directory!")
        sys.exit(1)  # exiting with error code

    writer = Writer()
    writer.start()

    throttle = Throttle(request_velocity=throttle_velocity)

    rss_requester = Requester(tag="RSS Requester",
                              path=requests_path,
                              throttle=throttle)
    website_requester = Requester(tag="Website Requester",
                                  path=requests_path,
                                  throttle=throttle)

    scheduler = Scheduler(patterns=article_download_pattern)

    crawler = Crawler(requester=rss_requester,
                      scheduler=scheduler,
                      feed_path=feed_path,
                      crawled_rss_articles_path=crawled_rss_articles_path,
                      rss_feed_crawl_period=rss_feed_crawl_period,
                      rss_feed_request_timeout=rss_feed_request_timeout,
                      warmup_iterations=warmup_iterations,
                      max_offset=max_offset)
    crawler.start()

    for i in range(number_download_worker):
        logger.info("Starting download worker #%d", i)
        DownloadWorker(requester=website_requester,
                       timeout=website_request_timeout,
                       path=downloads_path).start()

    while True:
        time.sleep(60)
        logger.debug("Number of threads running: %d", threading.active_count())
        process = psutil.Process(os.getpid())
        ram_usage = process.memory_full_info()
        # percent = absolute/mem.total
        logger.info("RAM usage: %s%%,  %s", process.memory_percent(),
                    ram_usage)
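
For reference, main() expects an INI-style file with at least the sections and keys it reads above. A hypothetical configs/config_default.txt could look like this; every value is a placeholder, not the project's real default.

; hypothetical configs/config_default.txt; all values are placeholders
[PATHS]
log_dir = ./logs
log_file_name = crawler.log
downloads = ./downloads
rss_articles = ./rss_articles
feeds_list = ./configs/feeds.txt
requests = ./requests/requests.log

[ARTICLE_DOWNLOAD_PATTERN]
number = 10
delay = 60

[CRAWLING]
number_download_worker = 4
rss_feed_crawl_period = 300
warmup_iterations = 2
throttle_velocity = 1.0
max_offset = 10

[REQUESTS]
website_request_timeout = 30
rss_feed_request_timeout = 10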
Example #7
        depth = seen[url]
        # stop following links once the maximum depth is reached
        if depth != max_depth:
            # add every URL on this page that matches link_regex to the queue
            for link in get_links(html):
                if re.search(link_regex, link):
                    # turn the link into an absolute URL
                    link = urlparse.urljoin(seed_url, link)
                    # skip links already seen, to avoid duplicate downloads
                    if link not in seen:
                        seen[link] = depth + 1
                        crawl_queue.append(link)

def get_links(html):
    """返回一个页面的所有链接"""
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html)

# callback function for scraping each downloaded page
import lxml.html
def scrae_call(url, html):
    FIELLDS = ['area']
    if re.search('/view', url):
        tree = lxml.html.fromstring(html)
        row = [tree.cssselect('tr#places_%s__row > td.w2p_fw' % field)[0].text_content() for field in FIELLDS]
        print url, row

import Callback
link_crawler('http://example.webscraping.com/places/default/index', '/(index|view)',
             throttle=Throttle(1), max_depth=-1, scrae_callback=Callback.ScraeCallback())
Example #8
if __name__ == "__main__":
    gLog.open()

    name = "Bill"

    #  Start the simulator and controller
    sak = StartAndKill()
    sak.start("simulator")
    sak.start("controller")

    # Create the communication resources for 5 users
    comRes = CommunicationResources(name = 'gui thr test', host = 'localhost', port = 1235, numberOfPackages = 5)

    # Create a throttle to read the layout file
    throttle = Throttle(name = name, comPkg = comRes.getNextPackage())
    gLog.print("Main: read layout file")
    msg = throttle.readLayout("../../runSoftware/Layout.xml")
    sleep(2)
    responseFlag, code = (msg.responseFlag, msg.code)
    gLog.print("Main: finished reading layout file: responseFlag = {0} and code ={1}".format(responseFlag, code))
    if responseFlag != 1:
        print("ABEND")
        print("Error in XML file with flag = {0} and code = {1}".format(responseFlag, code))
        print("THE END")
        input("press enter to quit")
    throttle.close()

    # Start four gui throttles
    gLog.print("Main: begin start four GuiThrottleProcess")
    for i in range(4):
Example #9
from Controller import Controller

# Sensor object initialization
a = BMP180()
rp = MPU6050()
initialAltitude = a.getAltitude()
(initialRoll, initialPitch) = rp.getAngles()
initialYaw = 0 # currently unused

# Motor object initialization
frequency = 55.5
maxThrottle = 10
minThrottle = 5

# motorNumber, motorPin, frequency, maxThrottle, minThrottle
t1 = Throttle(1, 12, frequency, maxThrottle, minThrottle)
t2 = Throttle(2, 13, frequency, maxThrottle, minThrottle)
t3 = Throttle(3, 22, frequency, maxThrottle, minThrottle)
t4 = Throttle(4, 18, frequency, maxThrottle, minThrottle)

# Controller object initialization
# roll controller variables
rSetPoint = 0
rIntegralThreshold = 90
rKP = 0.9
rKI = 0
rKD = 0.1

# pitch controller variables
pSetPoint = 0
pIntegralThreshold = 90