Example #1
def main():
    """TODO...
    ADD LOGGING FOR:
    1. Process start
    2. Collection/write start
    3. Collection/write end
    4. Process end
    """
    start_time = time.time()
    print(f'Starting {PROJECT_NAME}')

    # No need to hit API to collect data if we can just use the sample data
    if not LOCAL:
        # Authenticate Reddit w/ credentials
        reddit = Reddit()
        # Retrieve top posts from subreddit
        post_data = reddit.collect_data()
        # Dump post data to file as JSON
        write_data(post_data)
    else:
        print('LOCAL: True; skipping PRAW to use sample-reddit.json')

    # Download images using data recently saved in JSON file
    Downloader.download_images()

    time_elapsed = round(time.time() - start_time, 3)
    print(f'Finished {PROJECT_NAME} in {time_elapsed} seconds.')
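The docstring above asks for logging of four lifecycle points. A minimal sketch of how that could look with the standard logging module; the logger name and format are assumptions, not part of the original project:

import logging

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')
log = logging.getLogger('reddit-downloader')  # hypothetical logger name

log.info('Process start')             # 1. at the top of main()
log.info('Collection/write start')    # 2. before reddit.collect_data()
log.info('Collection/write end')      # 3. after write_data(post_data) returns
log.info('Process end')               # 4. alongside the final elapsed-time print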
Example #2
def main():
    #docopt parse the args
    arguments = docopt(__doc__, version='0.1.1-dev') #TODO get the version from __init__.py
    
    host = arguments.get('--host', 'mafreebox.free.fr')
    port = arguments.get('--port', '80')
    password = arguments.get('--password', None)
    if not password:
        password = getpass('Freebox password: ')
    client_mode = arguments.get('client', False)
    download_mode = arguments.get('download', False)
    
    #login 
    fbx_client = freeboxClient(host, port, None, password)  # params are host, port, username, password
    login_success = fbx_client.login()
    if client_mode:
        print "login success : %s" % login_success
    
    #if download, call the downloader
    if download_mode:
        path_or_url = arguments.get('PATH_OR_URL', None)
        if not path_or_url:
            print u"Give a file torrent path or url"
            return
        #now we will find out whether path_or_url is a path or a url
        torrent_file = None
        torrent_url = None
        
        if os.path.isfile(path_or_url):
            torrent_file = path_or_url
        else:
            torrent_url = path_or_url
        downloader = Downloader(fbx_client)
        downloader.add_file_to_download(torrent_file, torrent_url)
Example #3
class DownloaderTestCase(unittest.TestCase):

    master_playlist_url = 'https://rtsvodww-vh.akamaihd.net/i/1998/vers/vers_19980407_standard_vers_1998-04-07_Arch00_094728-,100k,700k,1200k,.mp4.csmil/master.m3u8'
    index_playlist_url = 'https://rtsvodww-vh.akamaihd.net/i/1998/vers/vers_19980407_standard_vers_1998-04-07_Arch00_094728-,100k,700k,1200k,.mp4.csmil/index_2_av.m3u8'

    def setUp(self):
        self.downloader = Downloader()
    
    def test_get_master_playlist_url(self):
        page_url = 'https://www.rts.ch/archives/tv/culture/verso/4716197-gribouille-en-metro.html'
        self.assertEqual(self.downloader.get_master_playlist_url(page_url), 
                         self.master_playlist_url)

    def test_get_index_playlist_url(self):
        self.assertEqual(
            self.downloader.get_index_playlist_url(self.master_playlist_url),
            self.index_playlist_url
            )

    def test_get_segment_urls(self):
        urls = self.downloader.get_segment_urls(self.index_playlist_url)
        num_segments = 33
        self.assertEqual(len(urls), num_segments)

        for i in range(num_segments):
            self.assertEqual(urls[i], 'https://rtsvodww-vh.akamaihd.net/i/1998/vers/vers_19980407_standard_vers_1998-04-07_Arch00_094728-,100k,700k,1200k,.mp4.csmil/segment' + str(i+1) + '_2_av.ts')
Example #4
 def __init__(self, file_info):
     Downloader.__init__(self)
     self.start = None
     self.end = None
     self.file_info = file_info
     self.file = None
     self.cal_offset()
     self.downloaded_byte = 0
Example #5
def download(dataset):
    log('Download dataset {}...'.format(dataset['id']))
    if dataset['type'] == 'url':
        Downloader(CurlDownloader(), dataset['localFolder'],
                   dataset['sourceUrls']).perform()
    elif dataset['type'] == 'kaggle':
        Downloader(KaggleDownloader(), dataset['localFolder'],
                   dataset['sourceUrls']).perform()
    log('Done!')
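A hypothetical dataset record that this dispatcher would accept; every field value below is illustrative only:

sample_dataset = {
    'id': 'example-prices',            # hypothetical identifier
    'type': 'url',                     # 'url' routes to CurlDownloader, 'kaggle' to KaggleDownloader
    'localFolder': '/tmp/example-prices',
    'sourceUrls': ['https://example.com/prices.csv'],
}
# download(sample_dataset) would then call
# Downloader(CurlDownloader(), '/tmp/example-prices', [...]).perform()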
Example #6
def main():
    login()
    #songsList = getSongsFromInternet()
    songsList = getSongsFromLocalarea("songsList.txt")
    #partition(songsList)
    downloader = Downloader(songsList)
    downloader.download(3, threadNum=10)
    #print(songsList)
    print("done.")
Example #7
 def __init__(self, url_q, wb_q, info_q):
     self.url_count = 1
     self.handle_urls = set()
     self.url_suf = 'http://weibo.com'
     self.url_q = url_q
     self.wb_q = wb_q
     self.info_q = info_q
     self.downloader = Downloader()
     self.validater = Validater()
Example #8
def no_proxy():
    if request.method == 'POST':
        title = request.form["ocw"]
        long_title = media[title].split('/')[-1]
        filepath = urljoin("downloads/", long_title)
        downloader = Downloader()
        downloader.get(media[title], filepath)
        flash('Successfully downloaded {0}'.format(long_title))
    return render_template('proxy.html', page_title="No proxy", action="no_proxy")
Example #9
 def test_download_to_dir(self):
   dir = tempfile.mkdtemp()
   try:
     d = Downloader(dir)
     with util.CaptureStdout():
       with d.download('file://' + __file__) as f:
         filename = f
   finally:
     shutil.rmtree(dir)
   self.assertEqual(dir, os.path.dirname(filename))
Example #10
 def test_file_removed_on_exception(self):
   try:
     d = Downloader()
     with util.CaptureStdout():
       with d.download('file://' + __file__) as f:
         filename = f
         raise TestException()
   except TestException:
     pass
   self.assertFalse(os.path.exists(filename))
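Tests #9 and #10 (together with #42 and #45 further down) pin down the contract they expect from Downloader: download(url) is a context manager that yields a local file path, the target directory defaults to the system temp directory, and the file is removed if the with-block raises. The real class is not shown on this page, so the following is only a minimal sketch of something that would satisfy these tests:

import contextlib
import os
import shutil
import tempfile
import urllib.request


class Downloader:
    def __init__(self, directory=None):
        # default to the system temp directory, as test_download_to_temp_dir expects
        self.directory = directory or tempfile.gettempdir()

    @contextlib.contextmanager
    def download(self, url):
        filename = os.path.join(self.directory, os.path.basename(url))
        with urllib.request.urlopen(url) as response, open(filename, 'wb') as out:
            shutil.copyfileobj(response, out)
        try:
            yield filename
        except Exception:
            os.remove(filename)  # drop the file if the caller's block fails
            raise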
Example #11
def main():
    # tie_url = 'http://tieba.baidu.com/p/4774287212'
    tie_url = input('Enter the thread url: ')
    tiebaimg = TiebaImage()
    img_url_list = tiebaimg.getImgUrls(tie_url)
    if not img_url_list:
        print('No image urls were parsed!!!')
        return
    downloader = Downloader()
    downloader.start(img_url_list)
Example #12
    def __init__(self):
        if not os.path.isdir(PLAYLIST_DIR):
            print(f'No config found at {PLAYLIST_DIR}')
            exit(1)

        if not os.path.isdir(DOWNLOADS_DIR):
            os.mkdir(DOWNLOADS_DIR)

        self.downloader = Downloader(DOWNLOADS_DIR)
        self.audio = None
Example #13
	def run(self):
		while self.current < self.end:
			self.printCurrent()
			
			# Load pageview data
			artDict = self.buildArtDict()
			print artDict

			# Process pageview data
			dl = Downloader(self.current,self.current+timedelta(days=1))
			dl.run(artDict)

			self.updateCurrentTime()
Example #14
def test_rule(url, regexp=''):
    download = Downloader()
    html1 = download.get(url)
    #print html1
    text1 = process_selector(selector, html1.text)
    md51 = md5(text1.encode('utf-8'))
    html2 = download.get(url)
    text2 = process_selector(selector, html2.text)
    md52 = md5(text2.encode('utf-8'))
    if md51 == md52:
        print 'md5 is same'
    else:
        print md51, md52
Example #15
def download(url='', title='', artist='', gender='', album=''):
    cleanMp3s()
    url = request.form['url']
    title = request.form['title']
    artist = request.form['artist']
    gender = request.form['gender']
    album = request.form['album']
    downloader = Downloader(url, title, artist, gender, album)
    try:
        path = downloader.download()
    except IOError as e:
        return str(e)
    return send_from_directory(os.path.abspath('.'), path, as_attachment=True)
Example #16
def get_data(url):
	print(url)
	down = Downloader(headers=headers_home)
	path = 'cache/hz.meituan.com/index.html'
	if os.path.exists(path):
		os.remove(path)
	uuid = get_uuid('http://hz.meituan.com/', down)
	if not uuid:
		return
	data = {}
	type_ = 'c' + url.split('/c')[-1][:-1]
	print(type_)
	cateId = type_[1:]
	areaId = '-1'
	# print(cateId, areaId)
	data['FIRST_LEVEL_DIRECTORY'] = '生活服务'
	data['SECOND_LEVEL_DIRECTORY'] = class_[type_]
	down.headers = headers_get
	index = 0
	while True:
		index = index + 1
		down.headers['Referer'] = url + '/' + 'pn' + str(index) + '/'
		url_get = 'http://apimobile.meituan.com/group/v4/poi/pcsearch/50?uuid='+uuid+'&userid=-1&limit=32&offset='+str((index-1)*32)+'&cateId='+cateId+'&areaId='+areaId
		html = down(url_get)
		try:
			search_result = json.loads(html)['data']['searchResult']
		except Exception as e:
			print('in get_data error ', e)
			break  # stop paginating on a parse error instead of reusing stale data
		if search_result == []:
			print('search_result is None')
			break
		# print(search_result)
		for one_item in search_result:
			data['SHOP_ID'] = one_item['id']
			data['SHOP_PHOTOS'] = one_item['imageUrl']
			data['SHOP_NAME'] = one_item['title']
			data['ADDRESS'] = one_item['address']
			data['RANK_STARS'] = one_item['avgscore']
			data['AVG_PRICE_TITLE'] = one_item['avgprice']
			tuangou = one_item['deals']
			if not tuangou:
				data['GROUP_BUYING_NUMBER'] = 0
				data['GROUP_BUYING'] = None
			else:
				data['GROUP_BUYING_NUMBER'] = len(tuangou)
				taocan = ''
				for one in tuangou:
					taocan = taocan + '价格' + str(one['price']) + ' 门市价' + str(one['value']) + ' 出售' + str(one['sales'])
				data['GROUP_BUYING'] = taocan
			db.insert_into(data)
Example #17
def downloadlink(url='', title='', artist='', gender='', album=''):
    cleanMp3s()
    url = request.form['url']
    title = request.form['title']
    artist = request.form['artist']
    gender = request.form['gender']
    album = request.form['album']
    downloader = Downloader(url, title, artist, gender, album)
    path = downloader.download()
    dir = 'files/'
    if not os.path.exists(dir):
        os.makedirs(dir)
    newpath = dir + path
    os.rename(path, newpath)
    return '<a href="/' + newpath + '">' + newpath + '</a>'
Example #18
def threaded_crawler(max_threads=10):
    # urls that still need to be crawled
    crawl_queue = [seed_url]
    # urls that have been seen
    seen = set([seed_url])
    D = Downloader(cache=cache,
                   delay=delay,
                   user_agent=user_agent,
                   proxies=proxies,
                   num_retries=num_retries,
                   timeout=timeout)

    def process_queue():
        while True:
            try:
                url = crawl_queue.pop()
            except IndexError:
                # empty queue
                break
            else:
                html = D(url)

    # wait for all download threads to finish
    threads = []
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue:
            # can start some more threads
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True)
            thread.start()
            threads.append(thread)

        time.sleep(SLEEP_TIME)
Example #19
 def __init__(self, master=True):
     threading.Thread.__init__(self)
     self.pagestore = PageStore()
     
     self.downloader = Downloader();
     self.connection = Connection(MONGODB_HOST, MONGODB_PORT)
     db = self.connection.download
     if master:
         db.drop_collection('downurl')
         for f, tb in ((SAVE_URL_RE_BLACK, 'save_url_black'),
             (SAVE_URL_RE_WHITE, 'save_url_white'),
             (DOWN_URL_RE_BLACK, 'down_url_black'),
             (DOWN_URL_RE_WHITE, 'down_url_white')):
             if os.path.exists(f):
                 db.drop_collection(tb)
                 logger.info('load rule:%s...' % f)
                 for s in set(open(f).readlines()):
                     s = s.strip()
                     if s:
                         db[tb].insert({'pattern': s})
                 logger.info('load rule:%s...OK' % f)
     self.downurl, self.allurl, self.watchurl, self.updateurl, self.secceedurl = db.downurl, db.allurl, db.watchurl, db.updateurl, db.secceedurl
     self.save_url_black = self.load_re(db.save_url_black)
     self.save_url_white = self.load_re(db.save_url_white)
     self.down_url_black = self.load_re(db.down_url_black)
     self.down_url_white = self.load_re(db.down_url_white)
     if master:
         self.load_watch_url()
         self.load_update_url()
         self.reload_allurl()
         logger.info('allurl:%d' % self.allurl.find().count())
         logger.info('secceedurl:%d' % self.secceedurl.find().count())
         logger.info('updateurl:%d' % self.updateurl.find().count())
         logger.info('watchurl:%d' % self.watchurl.find().count())
         logger.info('downurl:%d' % self.downurl.find().count())
Example #20
def install_from_url(url, install_location):
    if not os.path.isdir(install_location):
        raise DictError("Specified path is not a valid directory")

    if not os.access(install_location, os.W_OK):
        raise DictError("User not allowed to write to specified directory")

    if os.path.isdir(UNZIPPED_TEMP):
        shutil.rmtree(UNZIPPED_TEMP)

    if os.path.isdir(TEMP_DICT):
        shutil.rmtree(TEMP_DICT)

    os.mkdir(TEMP_DICT)
    os.mkdir(UNZIPPED_TEMP)

    #file_name = TEMP_DICT + "/mod.zip"
    #file_name = download_file(url)
    #file_name = Downloader(url, "")

    downloader = Downloader(url, TEMP_DICT)
    downloader.show()
    downloader.exec_()

    file_name = str(downloader.file_name)

    unzip2(file_name, UNZIPPED_TEMP)
    up_one = False

    for path, dirs, files in os.walk(UNZIPPED_TEMP):
        for f in reversed(dirs):
            print("In folder: " + f)
            if f == "GameData":
                up_one = True
                tree = path + "/" + f
                print("GameData Detected, copying to GameData folder and deleting: " + tree)
                copy_and_delete_tree(tree, install_location)

    if up_one:
        print("Upped one\n")
        copytree(UNZIPPED_TEMP, install_location + "/..")
    else:
        copytree(UNZIPPED_TEMP, install_location)

    os.remove(file_name)

    print("Finished\n")
Example #21
def graph(pageid):
    """
    For a given page ID pageid,
    create a graph based on the JSON file of the HTML.
    """
    D = Downloader()
    html = D("http://graph.facebook.com/" + pageid)
    return json.loads(html)
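A usage sketch; the page ID is arbitrary and an unauthenticated Graph API request may well be refused in practice:

data = graph('BillGates')   # hypothetical public page ID
print(data.get('id'))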
Example #22
    def unzip(self, downloader: Downloader):
        """Unzips the package(s) for this project.

        In the Project class there is only one package but derived classes can
        specify more than one package to unzip.

        Parameters
        ----------
        downloader : Downloader
            The downloader that was used to download the package(s).
        """

        split_name = self.name.split("/")
        if len(split_name) == 1:
            downloader.unzip(split_name[0])
        else:
            downloader.unzip(split_name[1])
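A usage sketch, assuming a Project constructed the way the Projects class later on this page does, and a Downloader that has already fetched the packages:

project = Project(
    "Ishiko/Errors",
    "ISHIKO",
    "Makefiles/$(compiler_short_name)/IshikoErrors.sln",
    False)
project.unzip(downloader)   # the name has two components, so this unzips the second one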
Example #23
def link_crawler(seed_url,
                 link_regex=None,
                 delay=5,
                 max_depth=-1,
                 max_urls=-1,
                 user_agent='wswp',
                 proxies=None,
                 num_retries=1,
                 scrape_callback=None,
                 cache=None):
    """Crawl from the given seed URL following links matched by link_regex
    """
    # the queue of URL's that still need to be crawled
    crawl_queue = [seed_url]
    # the URL's that have been seen and at what depth
    seen = {seed_url: 0}
    # track how many URL's have been downloaded
    num_urls = 0
    rp = get_robots(seed_url)
    D = Downloader(delay=delay,
                   user_agent=user_agent,
                   proxies=proxies,
                   num_retries=num_retries,
                   cache=cache)

    while crawl_queue:
        url = crawl_queue.pop()
        depth = seen[url]
        # check url passes robots.txt restrictions
        if rp.can_fetch(user_agent, url):
            html = D(url)
            links = []
            if scrape_callback:
                links.extend(scrape_callback(url, html) or [])

            if depth != max_depth:
                # can still crawl further
                if link_regex:
                    # filter for links matching our regular expression
                    links.extend(link for link in get_links(html)
                                 if re.match(link_regex, link))

                for link in links:
                    link = normalize(seed_url, link)
                    # check whether already crawled this link
                    if link not in seen:
                        seen[link] = depth + 1
                        # check link is within same domain
                        if same_domain(seed_url, link):
                            # success! add this new link to queue
                            crawl_queue.append(link)

            # check whether have reached downloaded maximum
            num_urls += 1
            if num_urls == max_urls:
                break
        else:
            print 'Blocked by robots.txt:', url
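A usage sketch in the style this crawler is usually driven; the seed URL and link regex are illustrative only:

link_crawler('http://example.webscraping.com', link_regex='/(index|view)',
             delay=3, num_retries=2, user_agent='wswp')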
Example #24
 def crawl(self):
     for conf in config_lists:
         for url in conf['urls']:
             resp = Downloader().download(url, conf)
             if resp:
                 proxy_list = PageParser().parse(resp, conf)
                 print(proxy_list)
                 print('Verifying that the proxies are usable')
                 valid_many(proxy_list, 'spider')
Example #25
def main(reparse=False):
    """Main entry point for this ETL process.  Downloads, updates db,
    stores the nightly data.

    This is the binary to run from a cron job.

    """

    os.chdir(os.path.dirname(__file__))
    logger = log.logger()
    logger.info('Starting ETL of FBO Nightly data.')

    # Figure out where we put data
    datadir = get_datadir()
    dbdir = get_dbdir()
    if not os.path.exists(os.path.join(dbdir, "sqlite3")):
        os.makedirs(os.path.join(dbdir, "sqlite3"))

    # Get a database connection, create db if needed
    db = model.FBO(
        "development",
        db_conf_file=os.path.join(
            dbdir,
            "dbconf.yml"))

    # Make sure the db schema is up to date, create tables, etc.
    db.migrate()

    assert os.path.exists(datadir)

    # Download raw data files
    dloader = Downloader(datadir, db, 'nightly')
    dloader.download(fname_urls, True)

    # Do our ETL
    nights = Nightlies(db)
    nights.etl_from_dir(reparse=reparse)

    # Close the db connection
    db.close()

    logger.info('Finished ETL of FBO data.')
Example #26
def install_ampl(filename, **kwargs):
    if installed('ampl'):
        return
    dir = filename.replace('.tgz', '')
    url = 'http://ampl.com/demo/' + filename
    install_dir = kwargs.get('install_dir', opt_dir)
    with Downloader(kwargs.get('download_dir', '.')).download(url) as f:
        with closing(tarfile.open(f, 'r:gz')) as archive:
            archive.extractall(install_dir)
    add_to_path(os.path.join(install_dir, dir, 'ampl'))
    add_to_path(os.path.join(install_dir, dir, 'ampl.lic'))
Example #27
def search(keyword):
    D = Downloader()
    url = 'https://www.google.com/search?q=' + urllib.quote_plus(keyword)
    html = D(url)
    tree = lxml.html.fromstring(html)
    links = []
    for result in tree.cssselect('h3.r a'):
        link = result.get('href')
        qs = urlparse.urlparse(link).query
        links.extend(urlparse.parse_qs(qs).get('q', []))
    return links
Example #28
def threaded_crawler(seed_url,
                     delay=5,
                     user_agent='wswp',
                     proxies=None,
                     num_retries=1,
                     max_threads=10,
                     timeout=60,
                     scrape_callback=None,
                     cache=None):
    """Crawl this website in multiple threads
    """
    #crawl_queue = Queue.deque([seed_url])
    crawl_queue = [seed_url]
    seen = set([seed_url])
    D = Downloader(cache=cache,
                   delay=delay,
                   user_agent=user_agent,
                   proxies=proxies,
                   num_retries=num_retries,
                   timeout=timeout)

    def process_queue():
        while True:
            try:
                url = crawl_queue.pop()
            except IndexError:
                break
            else:
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print 'Error in callback for: {}: {}'.format(url, e)
                    else:
                        for link in links:
                            link = normalize(seed_url, link)
                            print link
                            if link not in seen:
                                seen.add(link)
                                crawl_queue.append(link)

    threads = []
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue:
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True)
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)
Example #29
def install_maven(**kwargs):
    if installed('mvn'):
        return
    # 3.2.5 is the most recent version of Maven compatible with Java 6.
    dir = 'apache-maven-3.2.5'
    url = 'http://mirrors.sonic.net/apache/maven/maven-3/3.2.5/binaries/{0}-bin.tar.gz'.format(
        dir)
    install_dir = kwargs.get('install_dir', opt_dir)
    with Downloader(kwargs.get('download_dir', '.')).download(url) as f:
        with closing(tarfile.open(f, 'r:gz')) as archive:
            archive.extractall(install_dir)
    add_to_path(os.path.join(install_dir, dir, 'bin', 'mvn'))
Example #30
 def __init__(self):
     self.today = time.strftime("%Y-%m-%d",time.localtime(time.time()))
     self.urllogpath = "../data/url"
     os.system("mkdir -p %s" % self.urllogpath)
     self.urllog = "../data/url/" + "downloadedurl_" + self.today + ".txt"
     self.subpagepath = "../data/subpagepath"
     os.system("mkdir -p %s" % self.subpagepath)
     
     self.baseurl = BASEURLS 
     self.suburl = {}
     self.downloader = Downloader()
     self.html2db = Html2db()
Example #31
def link_crawler(seed_url,
                 link_regex=None,
                 delay=5,
                 max_depth=-1,
                 max_urls=-1,
                 headers=None,
                 user_agent='wswp',
                 proxies=None,
                 num_retries=1,
                 scrape_callback=None,
                 cache=None):
    crawl_queue = Queue.deque([seed_url])
    seen = {seed_url: 0}
    num_urls = 0
    # rp = get_robots(seed_url)
    D = Downloader(delay=delay,
                   user_agent=user_agent,
                   proxies=proxies,
                   num_retries=num_retries,
                   cache=cache)
    thrtl = throttle.Throttle(delay)
    headers = headers or {}
    if user_agent:
        headers['User-agent'] = user_agent

    while crawl_queue:
        url = crawl_queue.pop()
        if True:  # rp.can_fetch(user_agent, url):
            print url
            thrtl.wait(url)
            html = D(url)
            links = []
            if scrape_callback:
                links.extend(scrape_callback(url, html) or [])
            depth = seen[url]
            if depth != max_depth:
                if link_regex:
                    links.extend(link for link in get_links(html)
                                 if re.match(link_regex, link))
                for link in links:
                    link = normalize(seed_url, link)

                    if link not in seen:
                        seen[link] = depth + 1
                        # if same_domain(seed_url, link):
                        crawl_queue.append(link)

            num_urls += 1
            if num_urls == max_urls:
                break
        else:
            print 'Blocked by robots.txt:', url
Example #32
def threaded_crawler(seed_url, delay=5, cache=None, scrape_callback=None, user_agent='wswp', proxies=None, num_retries=1, max_threads=10, timeout=60):
    """Crawl this website in multiple threads
    """
    # the queue of URL's that still need to be crawled
    #crawl_queue = Queue.deque([seed_url])
    crawl_queue = [seed_url]
    # the URL's that have been seen 
    seen = set([seed_url])
    D = Downloader(cache=cache, delay=delay, user_agent=user_agent, proxies=proxies, num_retries=num_retries, timeout=timeout)

    def process_queue():
        while True:
            try:
                url = crawl_queue.pop()
            except IndexError:
                # crawl queue is empty
                break
            else:
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print 'Error in callback for: {}: {}'.format(url, e)
                    else:
                        for link in links:
                            link = normalize(seed_url, link)
                            # check whether already crawled this link
                            if link not in seen:
                                seen.add(link)
                                # add this new link to queue
                                crawl_queue.append(link)


    # wait for all download threads to finish
    threads = []
    while threads or crawl_queue:
        # the crawl is still active
        for thread in threads:
            if not thread.is_alive():
                # remove the stopped threads
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue:
            # can start some more threads
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True) # set daemon so main thread can exit when receives ctrl-c
            thread.start()
            threads.append(thread)
        # all threads have been processed
        # sleep temporarily so CPU can focus execution on other threads
        time.sleep(SLEEP_TIME)
Example #33
def process(rules):
	for rule in rules:
		download = Downloader()
		html = download.get(rule.url)
		if html is None:
			logger.error('%s is unreachable' % rule.corp)
			continue
		elif rule.selector:
			text = process_selector(rule,html.text)
		elif rule.types == 'github':
			rule.selector = "div.commit-group-title"
			text = process_selector(rule,html.text)
		else:
			text = html.text
		if text is None:
			continue
		hash_list = dataConfig.hash_list()
		html_md5 = md5(text.encode('utf-8')) # text is unicode; encode it before hashing
		if debug:
			print 'html:',text[:20]
			print 'hash_list:',hash_list
			print 'html_md5',html_md5
		
		if len(hash_list) > 0:
			if rule.corp in hash_list.keys():
				if html_md5 == hash_list[rule.corp]:
					logger.info('%s no change'%rule.corp)
				else: # the hash changed, so there is an update; send an email notification
					logger.warning('%s has update'%rule.corp)
					dataConfig.update_hash(rule.corp,html_md5)
					context = '<a href={0}>{0}</a>'.format(rule.url)
					Notification(rule.message).notification(context)
			else: # this corp is not tracked yet, so add its hash
				logger.info('adding new monitored app: %s' % rule.corp)
				dataConfig.add_hash(rule.corp,html_md5)
		else: # the hash list is empty, so initialize it first
			logger.info('wam init ....')
			dataConfig.add_hash(rule.corp,html_md5)
Example #34
def search(keyword):
    """
    Google search for a keyword.
    """
    D = Downloader()
    url = "https://www.google.com/search?q=" + ul.quote_plus(keyword)
    html = D(url)
    tree = lxml.html.fromstring(html)
    links = []
    for result in tree.cssselect("h3.r a"):
        link = result.get("href")
        qs = ulp.urlparse(link).query
        links.extend(ulp.parse_qs(qs).get("q", []))
    return links
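A usage sketch; the keyword is arbitrary, and Google may block or change the result markup this scraper relies on:

for link in search('web scraping'):
    print(link)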
Example #35
 def main(self):
     n = 0  # initialize; this is the first run of the function
     dw = Downloader(self.url)
     dw.GetHtml()
     parser = Parser(dw.GetHtml())
     self.list = parser.ReturnUrl()
     urlMa = UrlManager(parser.ReturnUrl())
     while n < 100:  # the loop starts here
         crawingurl = urlMa.print_2()[0]
         print(crawingurl)
         dw = Downloader(crawingurl)
         dw.GetHtml()
         parser = Parser(dw.GetHtml())
         print(parser.ReturnTitle())
         urlMa = UrlManager(parser.ReturnUrl())
         n = n + 1
Example #36
    def init_connection(self):
        try:
            self.vk_session = vk_api.VkApi(login=os.getenv("LOGIN"),
                                           password=os.getenv("PASSW"))
            try:
                self.vk_session.auth(token_only=True)
            except vk_api.AuthError as e:
                print(e)
                sys.exit(0)
            except vk_api.exceptions.Captcha as e:
                print("CAPTCHA")
                print(e.get_url())
                code = input()
                e.try_again(key=code)

            print("ID:", os.getpid())
            print("Got VK API Session")
            self.group_session = vk_api.VkApi(token=os.getenv("KEY"))
            print("Got Group Session")
            self.longpoll = VkBotLongPoll(self.group_session,
                                          os.getenv("GROUP_ID"))
            print("Got Longpoll Object")
            self.api = self.vk_session.get_api()
            print("Got API Object")
            self.group_api = self.group_session.get_api()
            print("Got Group API Object")
            self.upload = vk_api.VkUpload(self.vk_session)
            print("Got Upload Object")
            self.loader = Downloader()
            print("Got Downloader Object")
        except (requests.exceptions.ConnectionError) as e:
            print("Reinitializing session data")
            print(e)
            print("Timeout:", self.timeout)
            time.sleep(self.timeout)
            self.timeout += 1
            self.init_connection()
Example #37
def down_info_by_id(one_id=None):
	if not one_id:
		return None
	data = {}
	down = Downloader(headers=headers_home)
	id = one_id['SHOP_ID']
	sql = 'update crawler.mt_meishi set LABEL_IS_CCRAWLED = 2 where SHOP_ID = ' + id
	db.update_data(sql)
	url = HOMEURL + id + '/'
	uuid, data['TELEPHONE'], data['BUSINESS_TIME'] = get_uuid_phone_openTime_wifi(url, down)
	if uuid:
		data['REVIEW_COUNT'], data['NETIZEN_EVALUTION'] = get_review(uuid, id, url, down)
		if data['NETIZEN_EVALUTION'] == None:
			return
		limit = ''' '''
		for key, value in data.items():
			if data[key] != None:
				if type(data[key]) == int:
					limit = limit + str(key) + "=" + str(data[key]) + ","
				else:
					limit = limit + str(key) + "=" + "'" + data[key] + "'" + ","
		limit = limit[:-1]
		sql = 'update crawler.mt_meishi set ' + limit + ' where SHOP_ID = ' + id
		db.update_data(sql)
	else:
		print('uuid is None')
		return
	limit = ''
	sql = ''
	now_time = datetime.now()
	now_time = str(now_time)
	now_time = now_time.split('.')[0]
	data['UPDATE_TIME'] = now_time
	data['LABEL_IS_CCRAWLED'] = 1
	try:
		for key, value in data.items():
			if data[key] != None:
				if type(data[key]) == int:
					limit = limit + str(key) + "=" + str(data[key]) + ","
				else:
					limit = limit + str(key) + "=" + "'" + data[key] + "'" + ","
		limit = limit[:-1]
		sql = 'update crawler.mt_meishi set ' + limit + ' where SHOP_ID = ' + id
		db.update_data(sql)
	except Exception as e:
		print(e)
		pass
Example #38
def install_cmake(package, **kwargs):
    if kwargs.get('check_installed', True) and installed('cmake'):
        return
    dir, version, minor = re.match(r'(cmake-(\d+\.\d+)\.(\d+).*-[^\.]+)\..*',
                                   package).groups()
    # extractall overwrites existing files, so no need to prepare the
    # destination.
    url = 'https://cmake.org/files/v{0}/{1}'.format(version, package)
    install_dir = kwargs.get('install_dir', opt_dir)
    with Downloader(kwargs.get('download_dir', '.')).download(url) as f:
        iszip = package.endswith('zip')
        with zipfile.ZipFile(f) if iszip \
             else closing(tarfile.open(f, 'r:gz')) as archive:
            archive.extractall(install_dir)
    dir = os.path.join(install_dir, dir)
    if platform.system() == 'Darwin':
        dir = glob.glob(os.path.join(dir, 'CMake*.app', 'Contents'))[0]
    cmake_path = os.path.join(dir, 'bin', 'cmake')
    if install_dir != '.':
        add_to_path(cmake_path)
    return cmake_path
Example #39
def main():
    args = [i.lower() for i in sys.argv]

    if 'help' in args or len(args) == 1:
        print_help()

    if 'download' in args:
        down = Downloader()
        down.download()
        down.preprocess()
        down.write_out(train="train.dat",test="test.dat")
    if 'tag' in args:
        t = Tagger()
        t.tag("test.dat")
        t.write_out("test_tagged.dat")
    if 'train' in args:
        m = Model()
        m.train("train.dat")
        m.write_out()
    if 'test' in args:
        m = Model("model.mdl")
        m.test("test_tagged.dat")
Example #40
class Spider():
    def __init__(self):
        self.today = time.strftime("%Y-%m-%d",time.localtime(time.time()))
        self.urllogpath = "../data/url"
        os.system("mkdir -p %s" % self.urllogpath)
        self.urllog = "../data/url/" + "downloadedurl_" + self.today + ".txt"
        self.subpagepath = "../data/subpagepath"
        os.system("mkdir -p %s" % self.subpagepath)
        
        self.baseurl = BASEURLS 
        self.suburl = {}
        self.downloader = Downloader()
        self.html2db = Html2db()

    def get_safe_utf8(self,s):
        if isinstance(s,str):
            return s
        else:
            return s.encode('utf-8','ignore')
    
    def detect_html(self,html):
        if not html:return None
        try:
            return html.decode('utf-8')
        except:
            return html.decode('gbk','ignore')
    
    def normal_url(self,url):
        u = urlparse(url)
        if u.fragment:
            return url[:-(len(u.fragment) + 1)]
        return url
    
    def link_parse(self,html,base):
        if not html or not base: return False
        soup = BeautifulSoup(html)
        for li in soup.findAll('li'):
            try:
                li.contents[0].contents[0]
            except:
                continue
            title = li.contents[0].contents[0]
            #title = self.get_safe_utf8(title)
            href = li.contents[0]["href"]
            time = li.contents[1].strip()
            time = time.replace(u')',"")
            time = time.replace(u'(',"")
            #title = self.cleanHtmlTag(self.get_safe_utf8(title))
            if not href:continue
            if href in self.suburl.keys():continue
            href = self.normal_url(self.get_safe_utf8(urljoin(base, self.get_safe_utf8(href))))
            #self.suburl[href] = (title,time)
            if time == self.today:
                self.suburl[href] = (title,time)
            #print title 
            #print href
            #print time 
        return True

    def cleanHtmlAgain(self,value):
        regex1 = "&lt;[\s\S]*?&gt;"
        value = re.subn(regex1,"",value,re.M)
        return value[0]

    def cleanHtmlTag(self,html):
        html = html.strip()
        html = html.strip("\n")
        result = []
        parser = HTMLParser()
        parser.handle_data = result.append
        parser.feed(html)
        parser.close()
        res = ''.join(result)
        res = self.cleanHtmlAgain(res)
        return res

    def getSubUrl(self,baseurl):
        tmp = ""
        maxturnpage = 5
        regex = "\/[a-zA-Z0-9]+_[a-zA-Z0-9]+\.htm$"
        for i in range(1,maxturnpage):
            if(re.search(regex,baseurl)):
                regextmp = "\.htm$"
                tmp = re.sub(regextmp,"_" + str(i) + ".htm",baseurl)
            else:
                regexdel = "_\d?\.htm$"
                urltmp = re.sub(regexdel,"_" + str(i) + ".htm",baseurl)
                baseurl = urltmp
            html, redirect, code = self.downloader.fetch(self.get_safe_utf8(baseurl))
            if code == 200:
                html = self.detect_html(html)
                self.link_parse(html,redirect)
                print 'baseurl down succeed : %s' % baseurl
            baseurl = tmp
        return True

    def deleteDownloadedUrl(self):
        print "There are %s urls that need to be downloaded!" % len(self.suburl.keys())
        if os.path.isfile(self.urllog):
            logfile = open(self.urllog)
            if logfile:
                for line in logfile.readlines():
                    line = line.strip()
                    if line in self.suburl.keys():
                        del self.suburl[line]
            else:
                print("Could not open the logfile : %s" % self.urllog)
        else:
            print("the logfile : " + self.urllog + " does not exist this time!")
        print "There are %s urls that REALLY need to be downloaded!" % len(self.suburl.keys())
        
    def downloadPages(self,enChannel,chChannel):
        enChannelpath = self.subpagepath + "/" + enChannel
        os.system("mkdir -p %s" % enChannelpath)
        num = 0
        for suburl in self.suburl.keys():
            title = self.suburl[suburl][0]
            pubtime = self.suburl[suburl][1]
            html, redirect, code = self.downloader.fetch(self.get_safe_utf8(suburl))
            if code == 200:
                print "suburl download succeed : %s" % suburl
                html = self.detect_html(html)
                subpagefile = enChannelpath + "/content_" + self.today +"_" + str(num) + ".html"
                num = num + 1
                try:
                    fileout = open(subpagefile,"w")
                    fileout.write(self.get_safe_utf8(html) + "\n")
                    fileout.close()
                except IOError, e:
                    sys.stderr.write("could not open the subpagefile : %s" % subpagefile)
                soup = BeautifulSoup(html)
                for div in soup.findAll("div",id="Zoom"):
                    content = self.cleanHtmlTag(str(div))

                inserttime = time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(time.time()))
                try:
                    title = self.get_safe_utf8(title)
                except:
                    title = title
                content = self.get_safe_utf8(content)
                html = self.get_safe_utf8(html)
                chChannel = self.get_safe_utf8(chChannel)
                suburl = self.get_safe_utf8(suburl)
                self.html2db.datainsert(title,content,html,chChannel,suburl,pubtime,inserttime)
            print title
            print suburl
            print pubtime
            #print content
            #print html
            print chChannel
            print inserttime 
            print "################################################################################"
Example #41
import os

from download import Downloader
from uncompress import Uncompresser
from makergb import MakeRGB
from makepreview import MakePreview

if __name__ == '__main__':

    with open("creds.txt","r") as f:
        lines = f.readlines()
    username = lines[0].strip()
    password = lines[1].strip()

    # create tool instances
    dler = Downloader(username=username,password=password,DEBUG=True)
    uncomp = Uncompresser(DEBUG=True)
    rgb = MakeRGB(DEBUG=True)
    prev = MakePreview(DEBUG=True)

    # create list of known images
    #LC80130312013273LGN00
    prefix = 'LC8013031'
    #postfix = 'LGN01'
    images = [
        '2013273LGN00', # Sept 29th, 2013
    ]

    print "Processing {0} image archives ...".format(len(images))

    for image in images:
Example #42
 def test_download_to_temp_dir(self):
   d = Downloader()
   with util.CaptureStdout():
     with d.download('file://' + __file__) as f:
       filename = f
   self.assertEqual(tempfile.gettempdir(), os.path.dirname(filename))
Example #43
 def __init__(self, target):
     self.downloader = Downloader()
     self.projects = []
     self.projects.append(Project(
         "pugixml",
         "PUGIXML",
         None,
         False))
     self.projects.append(libgit2Project(target))
     self.projects.append(Project(
         "Ishiko/Errors",
         "ISHIKO",
         "Makefiles/$(compiler_short_name)/IshikoErrors.sln",
         False))
     self.projects.append(Project(
         "Ishiko/Collections",
         "ISHIKO",
         "Makefiles/$(compiler_short_name)/IshikoCollections.sln",
         False))
     self.projects.append(Project(
         "Ishiko/Process",
         "ISHIKO",
         "Makefiles/$(compiler_short_name)/IshikoProcess.sln",
         False))
     self.projects.append(Project(
         "DiplodocusDB/Core",
         "DIPLODOCUSDB",
         "Makefiles/$(compiler_short_name)/DiplodocusDBCore.sln",
         False))
     self.projects.append(Project(
         "DiplodocusDB/TreeDB/Core",
         "DIPLODOCUSDB",
         "Makefiles/$(compiler_short_name)/DiplodocusTreeDBCore.sln",
         False))
     self.projects.append(Project(
         "DiplodocusDB/TreeDB/XMLTreeDB",
         "DIPLODOCUSDB",
         "Makefiles/$(compiler_short_name)/DiplodocusXMLTreeDB.sln",
         False))
     self.projects.append(Project(
         "CodeSmithyIDE/CodeSmithy/Core",
         "CODESMITHY",
         "Makefiles/$(compiler_short_name)/CodeSmithyCore.sln",
         False))
     self.projects.append(Project(
         "CodeSmithyIDE/CodeSmithy/Make",
         "CODESMITHY",
         "Makefiles/$(compiler_short_name)/CodeSmithyMake.sln",
         False))
     self.projects.append(Project(
         "Ishiko/TestFramework/Core",
         "ISHIKO",
         "Makefiles/$(compiler_short_name)/IshikoTestFrameworkCore.sln",
         True))
     self.projects.append(Project(
         "Ishiko/WindowsRegistry",
         "ISHIKO",
         "Makefiles/$(compiler_short_name)/IshikoWindowsRegistry.sln",
         True))
     self.projects.append(Project(
         "Ishiko/FileTypes",
         "ISHIKO",
         "Makefiles/$(compiler_short_name)/IshikoFileTypes.sln",
         True))
     self.projects.append(Project(
         "CodeSmithyIDE/CodeSmithy/UICore",
         "CODESMITHY",
         "Makefiles/$(compiler_short_name)/CodeSmithyUICore.sln",
         True))
     self.projects.append(wxWidgetsProject())
     self.projects.append(Project(
         "CodeSmithyIDE/CodeSmithy/UIElements",
         "CODESMITHY",
         "Makefiles/$(compiler_short_name)/CodeSmithyUIElements.sln",
         True))
     self.projects.append(Project(
         "CodeSmithyIDE/CodeSmithy/UIImplementation",
         "CODESMITHY",
         "Makefiles/$(compiler_short_name)/CodeSmithyUIImplementation.sln",
         True))
     self.projects.append(Project(
         "CodeSmithyIDE/CodeSmithy/UI",
         "CODESMITHY",
         "Makefiles/$(compiler_short_name)/CodeSmithy.sln",
         True))
     self.projects.append(Project(
         "CodeSmithyIDE/CodeSmithy/Tests/Core",
         "CODESMITHY",
         "Makefiles/$(compiler_short_name)/CodeSmithyCoreTests.sln",
         True))
     self.projects.append(Project(
         "CodeSmithyIDE/CodeSmithy/Tests/Make",
         "CODESMITHY",
         "Makefiles/$(compiler_short_name)/CodeSmithyMakeTests.sln",
         True))
     self.projects.append(Project(
         "CodeSmithyIDE/CodeSmithy/Tests/UICore",
         "CODESMITHY",
         "Makefiles/$(compiler_short_name)/CodeSmithyUICoreTests.sln",
         True))
     self.tests = []
     self.tests.append(Test("CodeSmithyIDE/CodeSmithy/Tests/Core",
                            "CodeSmithyCoreTests.exe"))
     self._init_downloader()
Example #44
class Projects:
    def __init__(self, target):
        self.downloader = Downloader()
        self.projects = []
        self.projects.append(Project(
            "pugixml",
            "PUGIXML",
            None,
            False))
        self.projects.append(libgit2Project(target))
        self.projects.append(Project(
            "Ishiko/Errors",
            "ISHIKO",
            "Makefiles/$(compiler_short_name)/IshikoErrors.sln",
            False))
        self.projects.append(Project(
            "Ishiko/Collections",
            "ISHIKO",
            "Makefiles/$(compiler_short_name)/IshikoCollections.sln",
            False))
        self.projects.append(Project(
            "Ishiko/Process",
            "ISHIKO",
            "Makefiles/$(compiler_short_name)/IshikoProcess.sln",
            False))
        self.projects.append(Project(
            "DiplodocusDB/Core",
            "DIPLODOCUSDB",
            "Makefiles/$(compiler_short_name)/DiplodocusDBCore.sln",
            False))
        self.projects.append(Project(
            "DiplodocusDB/TreeDB/Core",
            "DIPLODOCUSDB",
            "Makefiles/$(compiler_short_name)/DiplodocusTreeDBCore.sln",
            False))
        self.projects.append(Project(
            "DiplodocusDB/TreeDB/XMLTreeDB",
            "DIPLODOCUSDB",
            "Makefiles/$(compiler_short_name)/DiplodocusXMLTreeDB.sln",
            False))
        self.projects.append(Project(
            "CodeSmithyIDE/CodeSmithy/Core",
            "CODESMITHY",
            "Makefiles/$(compiler_short_name)/CodeSmithyCore.sln",
            False))
        self.projects.append(Project(
            "CodeSmithyIDE/CodeSmithy/Make",
            "CODESMITHY",
            "Makefiles/$(compiler_short_name)/CodeSmithyMake.sln",
            False))
        self.projects.append(Project(
            "Ishiko/TestFramework/Core",
            "ISHIKO",
            "Makefiles/$(compiler_short_name)/IshikoTestFrameworkCore.sln",
            True))
        self.projects.append(Project(
            "Ishiko/WindowsRegistry",
            "ISHIKO",
            "Makefiles/$(compiler_short_name)/IshikoWindowsRegistry.sln",
            True))
        self.projects.append(Project(
            "Ishiko/FileTypes",
            "ISHIKO",
            "Makefiles/$(compiler_short_name)/IshikoFileTypes.sln",
            True))
        self.projects.append(Project(
            "CodeSmithyIDE/CodeSmithy/UICore",
            "CODESMITHY",
            "Makefiles/$(compiler_short_name)/CodeSmithyUICore.sln",
            True))
        self.projects.append(wxWidgetsProject())
        self.projects.append(Project(
            "CodeSmithyIDE/CodeSmithy/UIElements",
            "CODESMITHY",
            "Makefiles/$(compiler_short_name)/CodeSmithyUIElements.sln",
            True))
        self.projects.append(Project(
            "CodeSmithyIDE/CodeSmithy/UIImplementation",
            "CODESMITHY",
            "Makefiles/$(compiler_short_name)/CodeSmithyUIImplementation.sln",
            True))
        self.projects.append(Project(
            "CodeSmithyIDE/CodeSmithy/UI",
            "CODESMITHY",
            "Makefiles/$(compiler_short_name)/CodeSmithy.sln",
            True))
        self.projects.append(Project(
            "CodeSmithyIDE/CodeSmithy/Tests/Core",
            "CODESMITHY",
            "Makefiles/$(compiler_short_name)/CodeSmithyCoreTests.sln",
            True))
        self.projects.append(Project(
            "CodeSmithyIDE/CodeSmithy/Tests/Make",
            "CODESMITHY",
            "Makefiles/$(compiler_short_name)/CodeSmithyMakeTests.sln",
            True))
        self.projects.append(Project(
            "CodeSmithyIDE/CodeSmithy/Tests/UICore",
            "CODESMITHY",
            "Makefiles/$(compiler_short_name)/CodeSmithyUICoreTests.sln",
            True))
        self.tests = []
        self.tests.append(Test("CodeSmithyIDE/CodeSmithy/Tests/Core",
                               "CodeSmithyCoreTests.exe"))
        self._init_downloader()

    def get(self, name):
        for project in self.projects:
            if project.name == name:
                return project
        return None

    def set_environment_variables(self, output):
        print("")
        output.print_step_title("Setting environment variables")
        env = {}
        for project in self.projects:
            value = os.getcwd() + "/Build/" + project.name.split("/")[0]
            if project.env_var in env:
                old_value = env[project.env_var]
                if (old_value != value):
                    exception_text = "Conflicting values for " + \
                        "environment variable " + project.env_var + " (" + \
                        value + " vs " + old_value + ")"
                    raise RuntimeError(exception_text)
            else:
                env[project.env_var] = value
        for var_name in env:
            print("    " + var_name + ": " + env[var_name])
            os.environ[var_name] = env[var_name]
        output.next_step()

    def download(self):
        self.downloader.download()

    def build(self, build_tools, build_configuration,
              input, state, output):
        # For now only bypass pugixml, libgit2 and wxWidgets because they
        # are independent from the rest. More complex logic is required to
        # handle the other projects.
        # Unless we have built all projects successfully.
        for project in self.projects:
            if state.build_complete:
                project.built = True
            elif project.name in ["libgit2", "pugixml", "wxWidgets"]:
                if project.name in state.built_projects:
                    project.built = True
        for project in self.projects:
            print("")
            output.print_step_title("Building " + project.name)
            if project.built:
                print("    Using previous execution")
            else:
                project.unzip(self.downloader)
                project.build(build_tools, build_configuration,
                              input, output)
            state.set_built_project(project.name)
            output.next_step()
        state.set_build_complete()

    def test(self, compiler, architecture_dir_name, input):
        for test in self.tests:
            # TODO
            executable_path = "Build/" + test.project_name + \
                              "/Makefiles/VC15/x64/Debug/" + test.executable
            try:
                subprocess.check_call([executable_path])
            except subprocess.CalledProcessError:
                launchIDE = input.query("    Tests failed. Do you want to"
                                        " launch the IDE?", ["y", "n"], "n")
                if launchIDE == "y":
                    self.get(test.project_name).launch(compiler,
                                                       architecture_dir_name)
                raise RuntimeError(test.project_name + " tests failed.")

    def _init_downloader(self):
        for project in self.projects:
            project_downloader = project.create_downloader()
            self.downloader.merge(project_downloader)
Example #45
 def test_download(self):
   d = Downloader()
   with util.CaptureStdout():
     with d.download('file://' + __file__) as f:
       self.assertEqual(readfile(__file__), readfile(f))
Example #46
class Spider(threading.Thread):
    def __init__(self, master=True):
        threading.Thread.__init__(self)
        self.pagestore = PageStore()
        
        self.downloader = Downloader();
        self.connection = Connection(MONGODB_HOST, MONGODB_PORT)
        db = self.connection.download
        if master:
            db.drop_collection('downurl')
            for f, tb in ((SAVE_URL_RE_BLACK, 'save_url_black'),
                (SAVE_URL_RE_WHITE, 'save_url_white'),
                (DOWN_URL_RE_BLACK, 'down_url_black'),
                (DOWN_URL_RE_WHITE, 'down_url_white')):
                if os.path.exists(f):
                    db.drop_collection(tb)
                    logger.info('load rule:%s...' % f)
                    for s in set(open(f).readlines()):
                        s = s.strip()
                        if s:
                            db[tb].insert({'pattern': s})
                    logger.info('load rule:%s...OK' % f)
        self.downurl, self.allurl, self.watchurl, self.updateurl, self.secceedurl = db.downurl, db.allurl, db.watchurl, db.updateurl, db.secceedurl
        self.save_url_black = self.load_re(db.save_url_black)
        self.save_url_white = self.load_re(db.save_url_white)
        self.down_url_black = self.load_re(db.down_url_black)
        self.down_url_white = self.load_re(db.down_url_white)
        if master:
            self.load_watch_url()
            self.load_update_url()
            self.reload_allurl()
            logger.info('allurl:%d' % self.allurl.find().count())
            logger.info('secceedurl:%d' % self.secceedurl.find().count())
            logger.info('updateurl:%d' % self.updateurl.find().count())
            logger.info('watchurl:%d' % self.watchurl.find().count())
            logger.info('downurl:%d' % self.downurl.find().count())
        
    def load_re(self, tb):
        s = set([r['pattern'] for r in tb.find()])
        return [re.compile(r) for r in s]
    def get_safe_utf8(self, s):
        if isinstance(s, str):
            return s
        else:
            return s.encode('utf-8', 'ignore')
    def getmd5(self, s):
        m = md5.new()
        m.update(self.get_safe_utf8(s))
        return m.hexdigest()

    def get_one_task(self, tb):
        row = tb.find_and_modify(remove=True)
        if not row:return None
        row = self.allurl.find_one(row)
        return row['url'] if row else None

    def add_one_task(self, url, tb):
        s = url.lower()
        if s.startswith('http://') or s.startswith('https://'):
            k = self.getmd5(s)
            self.allurl.insert({'url': url, '_id':k})
            tb.insert({'_id': k})

    def load_watch_url(self):
        if not os.path.exists(WATCH_URL_FILE):
            return
        logger.info('load watch urls...')
        with open(WATCH_URL_FILE) as f:
            while True:
                url = f.readline()
                if not url:break
                self.add_one_task(url.strip(), self.watchurl)
        logger.info('load watch urls...%d' % self.watchurl.count())
    
    def normal_url(self, url):
        u = urlparse(url)
        if u.fragment:
            return url[:-(len(u.fragment) + 1)]
        return url
    
    def load_update_url(self):
        if not os.path.exists(UPDATE_URL_FILE):
            return
        logger.info('load update urls...')
        with open(UPDATE_URL_FILE) as f:
            while True:
                url = f.readline()
                if not url:break
                self.add_one_task(url.strip(), self.updateurl)
        logger.info('load update urls...%d' % self.updateurl.count())
    
    def check_url(self, url, black, white):
        for p in black:
            if p.search(url):
                return False
        if not white:
            return True
        for p in white:
            if p.search(url):
                return True
        return False
    
    def check_add_new_task(self, url):
        s = url.lower()
        #error url
        if not s.startswith('http://') and not s.startswith('https://'):
            return False
        #don't save url
        if not self.check_url(url, self.save_url_black, self.save_url_white):
            return False
        k = self.getmd5(s)
        #already save
        if self.allurl.find({'_id':k}).count():
            return False
        self.allurl.insert({'url': url, '_id':k})
        
        #dont't down
        if not self.check_url(url, self.down_url_black, self.down_url_white):
            return False
        
        #already down succeed
        if self.secceedurl.find({'_id':k}).count():
            return False
        self.downurl.insert({'_id': k})
        return True
    
    def reload_allurl(self):
        logger.info('reload all url...')
        for row in self.allurl.find():
            k, url = row['_id'], row['url']
            if not self.check_url(url, self.down_url_black, self.down_url_white):
                continue
            if self.secceedurl.find({'_id':k}).count():
                continue
            self.downurl.insert({'_id':k})
        logger.info('reload all url...%d ' % self.downurl.find().count())
    
    def detect_html(self, html):
        if not html:return None
        try:
            return html.decode('utf-8')
        except:
            return html.decode('gbk', 'ignore')
    
    def process_url(self, url):
        html, redirect, code = self.downloader.fetch(self.get_safe_utf8(url))
        if code == 200:
            html = self.detect_html(html)
            for href in self.link_parse(html, redirect):
                try:
                    self.check_add_new_task(href)
                except Exception as e:
                    logger.exception('%s,%s:%s' % (type(href), href, e.message))
            for k in set([self.getmd5(url.lower()), self.getmd5(redirect.lower())]):
                self.secceedurl.insert({'_id': k})
            if html:
                self.pagestore.succeed(url, html)
                return True
        return False
    
    def link_parse(self, html, base):
        urls = set()
        if not html or not base:return urls
        soup = BeautifulSoup(html)
        for a in soup.findAll('a'):
            href = a.get('href')
            if not href:continue
            if href in urls:continue
            href = self.normal_url(self.get_safe_utf8(urljoin(base, self.get_safe_utf8(href))))
            urls.add(href)
        return urls
    
    def get_url_block(self):
        while True:
            for tb in (self.watchurl, self.downurl, self.updateurl):
                url = self.get_one_task(tb)
                if url:return url
            logger.info('no any task')
            time.sleep(1)
        
    def proce_one_url(self):
        url = self.get_url_block()
        logger.info('down:%s' % url)
        ret = False
        try:
            ret = self.process_url(url)
        except Exception as e:
            logger.exception('url:%s %s' % (url, e.message))
        if not ret:
            self.pagestore.failed(url)

    def run(self):
        while True:
            try:
                while True:
                    self.proce_one_url()
            except Exception,e:
                logger.exception(e.message)
                time.sleep(1)