Example #1
def main(max_threads):
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    cache.clear()
    process_crawler(scrape_callback.seed_url,
                    scrape_callback=scrape_callback,
                    cache=cache,
                    max_threads=max_threads,
                    timeout=10)
Example #2
def main(max_threads):
    from mongo_cache import MongoCache
    from alexa_cb import AlexaCallback
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    cache.clear()
    process_crawler(scrape_callback.seed_url,
                    scrape_callback=scrape_callback,
                    cache=cache,
                    max_threads=max_threads,
                    timeout=10)  # process_crawler
Example #3
def threaded_crawler(seed_url, delay=5, cache=None, scrape_callback=None,
                     user_agent='wswp', proxies=None, num_retries=1,
                     max_threads=10, timeout=60):
    """Crawl using multiple threads
    """
    # the queue of URLs that still need to be crawled
    crawl_queue = MongoQueue()
    webpage_cache = MongoCache()
    # crawl_queue.clear()
    crawl_queue.push(seed_url)
    D = Downloader(delay=DEFAULT_DELAY, user_agent=DEFAULT_AGENT,
                   proxies=DEFAULT_PROXY_LIST, cookies=DEFAULT_COOKIE,
                   num_retries=DEFAULT_RETRIES, timeout=DEFAULT_TIMEOUT,
                   opener=None, cache=MongoCache())

    def process_queue():
        while True:
            # keep track of the URL currently being processed
            try:
                url = crawl_queue.pop()
            except KeyError:
                # currently no urls to process
                break
            else:
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print('Error in callback for: {}: {}'.format(url, e))
                    else:
                        for link in links:
                            # add this new link to queue
                            crawl_queue.push(normalize(seed_url, link))
            if (500 <= webpage_cache[url]['code'] < 600
                    or webpage_cache[url]['code'] == -999):
                crawl_queue.reset(url)
            else:
                crawl_queue.complete(url)

    # wait for all download threads to finish
    threads = []
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue.peek():
            # can start some more threads
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True)  # set daemon so main thread can exit when it receives ctrl-c
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)
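
The crawler above assumes a MongoQueue with push/pop/peek/complete/reset plus a truthiness check (`while threads or crawl_queue`), none of which is shown here. Below is a minimal sketch of that interface, assuming a wswp-style three-state queue; the collection name, field names, and state values are assumptions rather than the original class.

# Minimal sketch of the queue interface threaded_crawler expects (assumed).
from datetime import datetime
from pymongo import MongoClient, errors


class MongoQueueSketch(object):
    OUTSTANDING, PROCESSING, COMPLETE = range(3)

    def __init__(self, client=None):
        self.db = (client or MongoClient()).cache

    def __bool__(self):
        # keeps `while threads or crawl_queue:` looping while work remains
        return self.db.crawl_queue.find_one(
            {'status': {'$ne': self.COMPLETE}}) is not None
    __nonzero__ = __bool__  # Python 2 spelling

    def push(self, url):
        try:
            self.db.crawl_queue.insert_one(
                {'_id': url, 'status': self.OUTSTANDING})
        except errors.DuplicateKeyError:
            pass  # URL already queued

    def pop(self):
        record = self.db.crawl_queue.find_one_and_update(
            {'status': self.OUTSTANDING},
            {'$set': {'status': self.PROCESSING, 'timestamp': datetime.now()}})
        if record is None:
            raise KeyError()  # tells process_queue() to exit its loop
        return record['_id']

    def peek(self):
        record = self.db.crawl_queue.find_one({'status': self.OUTSTANDING})
        return record['_id'] if record else None

    def complete(self, url):
        self.db.crawl_queue.update_one(
            {'_id': url}, {'$set': {'status': self.COMPLETE}})

    def reset(self, url):
        self.db.crawl_queue.update_one(
            {'_id': url}, {'$set': {'status': self.OUTSTANDING}})
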
Example #4
    def __init__(self, mongo_host, mongo_port):

        last_lock = SimpleMongoServiceLock(mongo_host, mongo_port,
                                           'music_tour', 'last_lock', 1, 30)
        self.last_fm = LastFmService(
            MongoCache(mongo_host, mongo_port, 'music_tour', 'last_cache',
                       timedelta(weeks=24)), last_lock)
        spotify_lock = SimpleMongoServiceLock(mongo_host, mongo_port,
                                              'music_tour', 'spotify_lock', 1,
                                              30)
        self.spotify = SpotifyMetaService(
            MongoCache(mongo_host, mongo_port, 'music_tour', 'spotify_cache',
                       timedelta(weeks=24)), spotify_lock)
Example #5
def main():
    scrape_callback = AlexaCallback()
    cache = MongoCache(expires=timedelta())
    #cache.clear()
    link_crawler(scrape_callback.seed_url,
                 scrape_callback=scrape_callback,
                 cache=cache)
Example #6
def main(max_threads=5):
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    queue = MongoQueue()

    urls = []
    temple = scrape_callback.seed_url[0:-2]
    for i in range(1, 1189, 1):
        urls.append(temple + str(i) + '/')

    while True:
        now = datetime.now()
        if now.hour < 3 or now.hour > 12:
            queue.repairFast()
            process_crawler(
                urls,
                scrape_callback=scrape_callback,
                cache=cache,
                max_threads=max_threads,
                timeout=30,
                host=urlparse.urlparse(scrape_callback.seed_url).netloc,
                user_agent=
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36'
            )
        else:
            print 'pass:' + str(now)
            pass
        time.sleep(3600)
Example #7
 def __init__(self, cache=MongoCache()):
     self.max_page = 0
     self.base_url = configs.MAIN_PAGE_URL
     self.page_url = configs.EACH_PAGE_URL
     self.headers = self._load_headers()
     self.video_headers = self._load_headers('headers/video_headers')
     self.cache = cache
Example #8
def main():
    starttime = datetime.datetime.now()
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    #cache.clear()
    threaded_crawler('http://example.webscraping.com',scrape_callback.seed_url, scrape_callback=scrape_callback)
    endtime = datetime.datetime.now()
    print((endtime - starttime).seconds)
Example #9
 def test_cache_expired(self):
     cache = MongoCache(expires=timedelta())
     # every 60 seconds the cache is purged
     # http://docs.mongodb.org/manual/core/index-ttl/
     cache[self.url] = self.result
     sleep(61)
     with self.assertRaises(KeyError):
         cache[self.url]
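
The 61-second sleep above works because expiry is delegated to MongoDB's TTL monitor, which purges expired documents roughly once per minute. A hedged sketch of how such a TTL index might be created with pymongo follows; the database, collection, and field names are assumptions, not necessarily what MongoCache uses.

# Hypothetical sketch of the TTL index the expiry test relies on (assumed names).
from datetime import datetime, timedelta
from pymongo import MongoClient

collection = MongoClient().cache.webpage
expires = timedelta()  # expire immediately, as in MongoCache(expires=timedelta())

# MongoDB's background TTL task runs about every 60 seconds, hence sleep(61).
collection.create_index('timestamp',
                        expireAfterSeconds=expires.total_seconds())
collection.insert_one({'_id': 'http://example.webscraping.com',
                       'timestamp': datetime.utcnow(),
                       'result': {'html': '...'}})
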
Example #10
def main():
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    # cache.clear()
    link_crawler(scrape_callback.seed_url,
                 scrape_callback=scrape_callback,
                 cache=cache,
                 timeout=10,
                 ignore_robots=True)
Example #11
 def __call__(self, url, html):
     urls = []
     cache = MongoCache()
     for _, website in csv.reader(open(self.seed_url)):
         if website not in cache:
             urls.append(website)
             if len(urls) == self.max_urls:
                 break
     return urls
Example #12
def main(max_threads):

    cache = MongoCache()
    # cache.clear()
    threaded_crawler(
        seed_url='http://example.webscraping.com',
        scrape_callback=link_crawler('http://example.webscraping.com'),
        cache=cache,
        max_threads=max_threads,
        timeout=0)
Example #13
def test():
    start_url = 'http://www.alexa.com/topsites/global;0'
    cache = MongoCache()
    scrape_callback = AlaxeCallback(allow_domains=[start_url])
    process_crawler(start_url,
                    link_regex='/topsites/global;',
                    cache=cache,
                    scrape_callback=scrape_callback,
                    max_threads=8,
                    timeout=5)
Example #14
def main():
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    # cache.clear()
    crawler(scrape_callback.seed_url,
            proxies=[
                '127.0.0.1:8118',
            ],
            scrape_callback=scrape_callback,
            cache=cache)
Example #15
def threaded_crawler(seed_url,
                     delay=5,
                     cache=MongoCache(),
                     scrape_callback=None,
                     user_agent='wswp',
                     proxies=None,
                     num_retries=1,
                     max_threads=10,
                     timeout=10):
    """Crawl using multiple threads"""
    # the queue of URLs that still need to be crawled
    crawl_queue = MongoQueue()
    crawl_queue.clear()
    crawl_queue.push(seed_url)
    D = Downloader(cache=cache,
                   delay=delay,
                   user_agent=user_agent,
                   proxies=proxies,
                   num_retries=num_retries,
                   timeout=timeout)

    def process_queue():
        while True:
            # keep track of the URL currently being processed
            try:
                url = crawl_queue.pop()
            except KeyError:
                # currently no urls to process
                break
            else:
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print('Error in callback for {}: {}'.format(url, e))
                    else:
                        for link in links:
                            # add this new link to queue
                            crawl_queue.push(link)
                crawl_queue.complete(url)

    # wait for all download threads to finish
    threads = []
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue.peek():
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True)
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)  # sleep the thread for 1 s
Example #16
 def __call__(self, url, html):
     if url == self.seed_url:
         urls = []
         cache = MongoCache()
         with ZipFile(StringIO(html)) as zf:
             csv_filename = zf.namelist()[0]
             for _, website in csv.reader(zf.open(csv_filename)):
                 if 'http://' + website not in cache:
                     urls.append('http://' + website)
                     if len(urls) == self.max_urls:
                         break
         return urls
Example #17
def test(max_threads):
    start_url = 'http://www.alexa.com/topsites/global;0'
    scrape_callback = AlaxeCallback(allow_domains=[start_url])
    cache = MongoCache()
    # start_url = 'http://www.eastday.com'
    # start_url = 'http://www.qq.com'

    threaded_crawler(start_url,
                     link_regex='/topsites/global;',
                     cache=cache,
                     scrape_callback=scrape_callback,
                     max_threads=max_threads,
                     timeout=5)
Example #18
 def __call__(self, url, html):
     if url == self.seed_url:
         urls = []
         cache = MongoCache()
         with ZipFile(BytesIO(html.content)) as zf:
             csv_filename = zf.namelist()[0]
             data = StringIO(zf.open(csv_filename).read().decode('utf-8'))
             for _, website in csv.reader(data):
                 if 'http://' + website not in cache:
                     urls.append('http://' + website)
                     if len(urls) == self.max_urls:
                         break
         return urls
Example #19
def com_alexa():
    """
    从该网址下载一些热门网址
    """
    start_url = 'http://www.alexa.com/topsites/global;0'
    scrape_callback = AlaxeCallback(allow_domains=start_url)
    link_crawler(start_url,
                 link_regex='/topsites/global;',
                 delay=3,
                 only_same_host=False,
                 save_cache=False,
                 max_urls=100,
                 cache=MongoCache(),
                 scrape_callback=scrape_callback,
                 timeout=3)
    del scrape_callback
Example #20
    def __init__(
            self, output_dir, start_date, end_date, chosen_program=None,
            use_cache=False):
        self.output_dir = output_dir
        self.start_date = start_date
        self.end_date = end_date
        self.chosen_program = chosen_program

        self.base_url = 'https://www.byte.fm'
        self.header = ["program", "date", "title", "artist", "album", "label"]
        self.parser = HTMLParser()
        if use_cache:
            from mongo_cache import MongoCache
            cache = MongoCache()
        else:
            cache = None
        self.Downloader = Downloader(cache=cache)
Example #21
def main(max_threads = 5):
    catlog_callback = AlexaCallback()
    cache = MongoCache()
    queue = MongoQueue()


    client = MongoClient('localhost', 27017, connect=False)
    # create a collection to store cached webpages,
    # which is the equivalent of a table in a relational database
    db = client.cache
    cursor = db.books.find()

    urls = []
    while cursor.alive:
        temp = cursor.next()
        temp = temp['link']

        if urlparse.urlparse(catlog_callback.seed_url).netloc == 'www.junzige.la':
            temp = '/novel' + temp[5:-4] + '/'
            temp = normalize(catlog_callback.seed_url, temp)
        elif urlparse.urlparse(catlog_callback.seed_url).netloc == 'www.boluoxs.com':
            temp = 'http://www.boluoxs.com/biquge/0/' + temp[temp.rfind('/') + 1 :temp.rfind('.')] + '/'

        print temp
        urls.append(temp)

    print urls[0]

    while True:
        now = datetime.now()

        if now.hour < 3 or now.hour > 12:
            queue.repairFast()
            process_crawler(urls, scrape_callback=catlog_callback, cache=cache, max_threads=max_threads, timeout=30, host = urlparse.urlparse(catlog_callback.seed_url).netloc, user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36')
            # every time finished, clear the job queue
            queue.clear()
        else:
            print 'pass:' + str(now)
            pass
        time.sleep(3600)
Example #22
 def setUp(self):
     self.cache = MongoCache(default_timeout=0)
Example #23
                        else:
                            stop = 1  # all detail pages under this list page are empty; stop adding list pages
                if 'top250' in url and stop == 0:
                    page_size += 25
                    next_link = form_url.format(page_size)
                    if next_link not in seen:
                        seen.add(next_link)
                        crawl_queue.append(next_link)
    # wait for all download threads to finish
    threads = []
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                # remove threads that have stopped
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue:
            # start more threads
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True)
            thread.start()
            threads.append(thread)
        time.sleep(np.random.randint(6, 12))



if __name__ == '__main__':
    Scrape_Back = GetDetailInfo
    Cache = MongoCache()
    Cache.clear()
    threaded_crawler(scrape_callback=Scrape_Back, cache=Cache)
Example #24
class TestCache(unittest.TestCase):

    def setUp(self):
        self.cache = MongoCache(default_timeout=0)

    def tearDown(self):
        self.cache.collection.delete_many({})

    def test_get(self):
        x = MockData(1)

        self.cache.set('key-1', x)

        xc = self.cache.get('key-1')
        self.assertEqual(x, xc)

    def test_delete_existing(self):
        x = MockData(1)

        self.cache.set('key-1', x)

        self.assertTrue(self.cache.delete('key-1'))

    def test_delete_not_existing(self):
        self.assertFalse(self.cache.delete('key-1'))

    def test_set(self):
        x = MockData(1)
        self.cache.set('key-1', x)

        xc = self.cache.get('key-1')

        self.assertEqual(x, xc)

    def test_add_not_existing(self):
        x = MockData(1)

        added = self.cache.add('key-1', x)

        self.assertTrue(added)

    def test_add_existing(self):
        x = MockData(1)
        self.cache.set('key-1', x)

        y = MockData(2)
        added = self.cache.add('key-1', y)

        self.assertFalse(added)

    def test_clear(self):
        x = MockData(1)
        self.cache.set('key-1', x)

        cleared = self.cache.clear()
        xc = self.cache.get('key-1')

        self.assertTrue(cleared)
        self.assertIsNone(xc)

    def test_set_overwrite(self):
        x1 = MockData(1)
        key = 'key-set-overwrite'
        self.cache.set(key, x1)

        x2 = MockData(2)
        self.cache.set(key, x2)

        _filter = {'_id': key}
        count_keys = self.cache.collection.count(_filter)

        self.assertEqual(1, count_keys)

    def test_inc_with_exist_key(self):
        value = 10
        key = 'key-inc-with-exist-key'
        self.cache.set(key, value)

        delta = 9
        new_value = self.cache.inc(key, delta)

        value_cache = self.cache.get(key)

        result = delta + value

        self.assertEqual(result, value_cache)
        self.assertEqual(result, new_value)

    def test_inc_witho_exist_key(self):
        key = 'key-inc-without-exist-key'
        delta = 9
        new_value = self.cache.inc(key, delta)

        value_cache = self.cache.get(key)

        self.assertEqual(delta, value_cache)
        self.assertEqual(delta, new_value)

    def test_inc_with_error(self):
        value = MockData(1)
        key = 'key-inc-with-error'
        self.cache.add(key, value)

        delta = 9
        new_value = self.cache.inc(key, delta)

        value_cache = self.cache.get(key)

        self.assertEqual(value, value_cache)
        self.assertEqual(None, new_value)

    def test_has_with_add_key(self):
        key = 'key-has-with-add-key'
        value = MockData(1)

        self.cache.add(key, value)

        has_key = self.cache.has(key)

        self.assertTrue(has_key)

    def test_has_without_add_key(self):
        key = 'key-has-without-add-key'

        has_key = self.cache.has(key)

        self.assertFalse(has_key)

    def test_dec_with_exist_key(self):
        value = 10
        key = 'key-dec-with-exist-key'
        self.cache.set(key, value)

        delta = 9
        new_value = self.cache.dec(key, delta)

        value_cache = self.cache.get(key)

        result = value - delta

        self.assertEqual(result, value_cache)
        self.assertEqual(result, new_value)

    def test_dec_witho_exist_key(self):
        key = 'key-dec-without-exist-key'
        delta = 9
        new_value = self.cache.dec(key, delta)

        value_cache = self.cache.get(key)

        self.assertEqual(-delta, value_cache)
        self.assertEqual(-delta, new_value)

    def test_dec_with_error(self):
        value = MockData(1)
        key = 'key-dec-with-error'
        self.cache.add(key, value)

        delta = 9
        new_value = self.cache.dec(key, delta)

        value_cache = self.cache.get(key)

        self.assertEqual(value, value_cache)
        self.assertEqual(None, new_value)

    def test_get_many(self):
        key_x_value = {'key-%s' % i: MockData(i) for i in range(1, 11)}

        for key, value in key_x_value.items():
            self.cache.add(key, value)

        values = self.cache.get_many(*key_x_value.keys())

        self.assertEqual(10, len(values))
        for _return, _value in zip(values, key_x_value.values()):
            self.assertEqual(_value, _return)

    def test_get_dict(self):
        key_x_value = {'key-%s' % i: MockData(i) for i in range(1, 6)}

        for key, value in key_x_value.items():
            self.cache.add(key, value)

        results = self.cache.get_dict(*key_x_value.keys())

        self.assertIsInstance(results, dict)
        for key, value in key_x_value.items():
            self.assertIn(key, results)
            self.assertEqual(key_x_value[key], results[key])

    def test_delete_many(self):
        key_x_value = {'key-%s' % i: MockData(i) for i in range(1, 6)}

        for key, value in key_x_value.items():
            self.cache.add(key, value)

        self.assertEqual(5, self.cache.collection.count({'_id': {'$in': key_x_value.keys()}}))

        result = self.cache.delete_many(*key_x_value.keys())

        self.assertTrue(result)
        self.assertEqual(0, self.cache.collection.count({'_id': {'$in': key_x_value.keys()}}))

    def test_set_many(self):
        key_x_value = {'key-set-many-%s' % i: MockData(i) for i in range(1, 6)}

        result = self.cache.set_many(key_x_value)

        self.assertTrue(result)
        self.assertEqual(5, self.cache.collection.count({'_id': {'$in': key_x_value.keys()}}))
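
The assertions above compare a stored MockData instance with the value read back from the cache, which only passes if MockData defines value-based equality (and can be serialized by the cache). The class itself is not part of the example, so the following is a plausible minimal sketch, not the original definition.

# Assumed minimal MockData consistent with how the tests use it.
class MockData(object):
    def __init__(self, value):
        self.value = value

    def __eq__(self, other):
        return isinstance(other, MockData) and self.value == other.value

    def __ne__(self, other):  # Python 2 compatibility
        return not self.__eq__(other)

    def __hash__(self):
        return hash(self.value)

    def __repr__(self):
        return 'MockData(%r)' % self.value
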
Example #25
import pandas as pd
import re
import numpy as np
from process_crawler import process_crawler
from mongo_queue import MongoQueue
from mongo_cache import MongoCache
from mongo_info import MongoInfo
from downloader import Downloader
from lxml import etree

crawl_queue = MongoQueue()
webpage_cache = MongoCache()
DEFAULT_AGENT = {}
DEFAULT_DELAY = 5
DEFAULT_RETRIES = 1
DEFAULT_TIMEOUT = 100
DEFAULT_PROXY_LIST = '/Users/apple/Desktop/connect/proxylist/proxies.csv'
DEFAULT_COOKIE = {}

D = Downloader(delay=DEFAULT_DELAY, user_agent=DEFAULT_AGENT,
               proxies=DEFAULT_PROXY_LIST, cookies=DEFAULT_COOKIE,
               num_retries=DEFAULT_RETRIES, timeout=DEFAULT_TIMEOUT,
               opener=None, cache=MongoCache())

def usere(regex, getcontent):  # define a helper that applies a regular expression
    pattern = re.compile(regex)
    content = re.findall(pattern, getcontent)
    return content

#Obtain target urls
startdate = '20180414'
enddate = '20180415'
    """Initialize robots parser for this domain
    """
    rp = robotparser.RobotFileParser()
    rp.set_url(urlparse.urljoin(url, '/robots.txt'))
    rp.read()
    return rp


def get_links(html):
    """Return a list of links from html
    """
    # a regular expression to extract all links from the webpage
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    # list of all links from the webpage
    return webpage_regex.findall(html)


if __name__ == '__main__':
    link_crawler('http://example.webscraping.com',
                 '/(index|view)',
                 delay=0,
                 num_retries=1,
                 user_agent='BadCrawler')
    link_crawler('http://example.webscraping.com',
                 '/places/default/view',
                 delay=0,
                 num_retries=1,
                 max_depth=10,
                 user_agent='GoodCrawler',
                 cache=MongoCache(expires=datetime.timedelta()))
Example #27

def get_links(html):
    """Return a list of links from html 
    """
    # a regular expression to extract all links from the webpage
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    # list of all links from the webpage
    return webpage_regex.findall(html)


if __name__ == '__main__':
    from mongo_cache import MongoCache

    class CallBack:
        def __init__(self, filename='log.txt'):
            self.file = open(filename, 'w+')

        def __call__(self, url, html):
            self.file.write("{}\n".format(url))

    cache = MongoCache()
    link_crawler('http://example.webscraping.com/places/default',
                 '/places/default/(index|view)',
                 delay=1,
                 num_retries=1,
                 max_depth=3,
                 user_agent='GoodCrawler',
                 cache=cache,
                 scrape_callback=CallBack())
Example #28
def main():
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    cache.clear()
    link_crawler(scrape_callback.seed_url, scrape_callback=scrape_callback, cache=cache, user_agent='GoodCrawler', ignore_robots=True)
Example #29
# -*- coding: utf-8 -*-
from datetime import timedelta

from pymongo import MongoClient

from mongo_cache import MongoCache

cache = MongoCache()
cache.clear()
url = 'http://example.webscraping.comasdf'
result = {'html': '...'}
cache[url] = result
print(cache[url]['html'] == result['html'])
cache = MongoCache(expires=timedelta())
cache[url] = result
import time
time.sleep(60)
print(cache[url])
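
Throughout these examples MongoCache is used like a dict keyed by URL: cache[url] = result, cache[url]['html'], `url in cache`, cache.clear(), and an expires timedelta. A minimal sketch of that interface is shown below, assuming a wswp-style implementation backed by a TTL index; the database/collection names and storage format are assumptions.

# Dict-like MongoCache sketch (assumed names and storage format).
from datetime import datetime, timedelta
from pymongo import MongoClient


class MongoCacheSketch(object):
    def __init__(self, client=None, expires=timedelta(days=30)):
        self.db = (client or MongoClient()).cache
        # TTL index: MongoDB removes expired records in the background
        self.db.webpage.create_index(
            'timestamp', expireAfterSeconds=expires.total_seconds())

    def __contains__(self, url):
        try:
            self[url]
        except KeyError:
            return False
        return True

    def __getitem__(self, url):
        record = self.db.webpage.find_one({'_id': url})
        if record is None:
            raise KeyError(url + ' does not exist')
        return record['result']

    def __setitem__(self, url, result):
        record = {'result': result, 'timestamp': datetime.utcnow()}
        self.db.webpage.update_one({'_id': url}, {'$set': record}, upsert=True)

    def clear(self):
        self.db.webpage.drop()
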
Example #30
				link = normalize(seed_url,link)
				if link not in seen:
					seen[link] = depth+1
					if same_domain(link,seed_url):
						crawl_queue.append(link)
						#print('crawl_queue=',crawl_queue)
		num +=1
		print('num=',num)
		num_urls +=1
		if num_urls == max_urls:
				break


def get_links(html):
	webpage_regex=re.compile('<a href="position.php\?(.*?)"',re.IGNORECASE)
	#print('webpage_regex.findall(html)=',webpage_regex.findall(html))
	return webpage_regex.findall(html)

def normalize(seed_url,link):
	link,_=urllib.parse.urldefrag(link)
	return urllib.parse.urljoin(seed_url,link)

def same_domain(url_1,url_2):
	return urllib.parse.urlparse(url_1).netloc == urllib.parse.urlparse(url_2).netloc

link_crawler('https://hr.tencent.com/position.php?keywords=python', 'keywords=python&start=', cache=MongoCache())
Example #31
def main(max_threads):
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    cache.clear()
    process_crawler(scrape_callback.seed_url, scrape_callback=scrape_callback, cache=cache, max_threads=max_threads, timeout=10)
Example #32
from fangjia_thread_crawler import thread_crawler
from fangjia_cb import FangjiaCallback
from mongo_cache import MongoCache
from downloader import Downloader
from fangjia2 import get_search
from fangjia2 import get_info_list
from fangjia2 import download
import pandas as pd
import cPickle
import os

if __name__ == '__main__':
    # get the seed_urls
    starttime = datetime.datetime.now()
    seed_urls = []
    cache = MongoCache()  # cache all pages
    if os.path.exists('seed_urls.pkl'):
        with open('seed_urls.pkl', 'rb') as fp:
            seed_urls = cPickle.load(fp)
    else:
        base_url = r'http://cd.fangjia.com/ershoufang/'
        search_list = []  # list of property-listing URLs
        tmp_list = []  # temporary buffer of property-listing URLs
        layer = -1
        # first-level filtering
        #D = Downloader(cache=cache)
        page = download(base_url)
        search_dict = get_search(page, 'r-')
        # second-level filtering
        for k in search_dict:
            print u'**************** level-1 crawl: fetching [%s] ***************' % k
Example #33
class TestTimeout(unittest.TestCase):

    def setUp(self):
        self.cache = MongoCache()

    def tearDown(self):
        self.cache.collection.delete_many({})

    def test_set(self, mock_time):
        key = 'key-set'
        mock_time.return_value = 100

        self.cache.set(key, MockData(1), timeout=300)

        doc = self.cache.collection.find_one({'_id': key})

        self.assertIn('expires', doc)
        self.assertEqual(400, doc['expires'])
        self.assertIn('value', doc)

    def test_get_not_expired(self, mock_time):
        key = 'key-not-expired'
        mock_time.return_value = 100

        self.cache.set(key, MockData(1), timeout=100)

        mock_time.return_value = 150
        result = self.cache.get(key)

        self.assertIsNotNone(result)
        self.assertEqual(result, MockData(1))

    def test_get_timeout_0(self, mock_time):
        key = 'key-not-expired'
        mock_time.return_value = 100

        self.cache.set(key, MockData(1), timeout=0)

        result = self.cache.get(key)

        self.assertIsNotNone(result)
        self.assertEqual(result, MockData(1))

    def test_get_expired(self, mock_time):
        key = 'key-get-expired'
        mock_time.return_value = 100

        self.cache.set(key, MockData(1), timeout=100)

        mock_time.return_value = 201
        result = self.cache.get(key)

        self.assertIsNone(result)

    def test_get_many_expired(self, mock_time):
        key_timeout_1 = 'key-timeout-1'
        key_timeout_100 = 'key-timeout-100'

        mock_time.return_value = 100

        self.cache.set(key_timeout_1, MockData(1), timeout=1)
        self.cache.set(key_timeout_100, MockData(1), timeout=100)

        mock_time.return_value = 150

        results = self.cache.get_many(*[key_timeout_1, key_timeout_100])

        self.assertIsNone(results[0])
        self.assertIsNotNone(results[1])
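
Each test method above takes a mock_time argument, which suggests the original class is wrapped in a mock.patch class decorator that patches the clock and injects the mock into every test. The sketch below shows the pattern; the real tests would patch time.time wherever mongo_cache imports it, so the target here is only illustrative.

import unittest
from unittest import mock


# mock.patch applied to a TestCase class wraps every test_* method and passes
# the created mock in as an extra argument -- the mock_time parameter above.
@mock.patch('time.time')
class TestTimeoutSetup(unittest.TestCase):
    def test_clock_is_patched(self, mock_time):
        mock_time.return_value = 100
        self.assertEqual(100, mock_time())


if __name__ == '__main__':
    unittest.main()
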
Example #34
 def setUp(self):
     self.cache = MongoCache()
Example #35
def link_crawler(seed_url,link_regex_large,link_regex_small,max_depth=2,max_threads=5):
	'Crawl from the given seed URL following links matched by link_regex'
	print 'seed_url', seed_url
	#crawl_queue=[seed_url]
	crawl_queue=Mongo_Queue()
	#seen={seed_url:0}#no need this seen for Mongo_Queue will take care of duplicate url
	#crawl_queue=Mongo_Queue.push(seed_url)
	crawl_queue.push(seed_url)
	depth=(crawl_queue.get_item(seed_url))['depth']
	print 'seedurldepth:',depth
	cache=MongoCache()
	D=Download(cache=cache)
	#result_links=set()
	csvFile=open('D:/Work/Projects/realestate/app/static/163_money.csv','wb')
	writer=csv.writer(csvFile)

	def process_queue():
		'extract the page-download part as a function, so that every thread can call it to download pages'
		while True:
			try:
				url=crawl_queue.pop()
			except KeyError:
				#no url in crawl_queue
				break
			else:
				depth=(crawl_queue.get_item(url))['depth']
				'depth=128,129'
				#print depth
				if depth<=max_depth:
					html=D(url)
					links=re.findall(link_regex_large,html)

					for link in links:
						if re.match(link_regex_small,link):
							writer.writerow((link,''))
							#writer.writerow((link,''))
							print link
						else:
							crawl_queue.push(link,depth+1)
							#encoding=chardet.detect(link)
							#link=link.decode(encoding).encode('utf-8')
							#crawl_queue.push(link,depth+1)
							#seen[link]=depth+1
				crawl_queue.complete(url)
								
	threads=[]
	while crawl_queue or threads:
		while len(threads)<max_threads and crawl_queue:
			#can start some more threads
			thread=threading.Thread(target=process_queue)
			# daemon's value must be set before start(), or RuntimeError will be raised; set daemon=True so that the main thread can exit when it receives ctrl-c
			thread.setDaemon(True)
			thread.start()
			threads.append(thread)

		for thread in threads:
			if not thread.is_alive():
				#remove the stopped threads
				threads.remove(thread)

	csvFile.close()