Example #1
def install_ampl(filename, **kwargs):
    if installed('ampl'):
        return
    dir = filename.replace('.tgz', '')
    url = 'http://ampl.com/demo/' + filename
    install_dir = kwargs.get('install_dir', opt_dir)
    with Downloader(kwargs.get('download_dir', '.')).download(url) as f:
        with closing(tarfile.open(f, 'r:gz')) as archive:
            archive.extractall(install_dir)
    add_to_path(os.path.join(install_dir, dir, 'ampl'))
    add_to_path(os.path.join(install_dir, dir, 'ampl.lic'))
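A minimal usage sketch for the installer above; the archive name and directories below are placeholders, and installed(), opt_dir and add_to_path() are assumed to come from the surrounding module:

# Hypothetical call: 'amplide.linux64.tgz' is an illustrative archive name,
# not a verified file on ampl.com/demo.
install_ampl('amplide.linux64.tgz', install_dir='/opt', download_dir='/tmp')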
Example #2
def search(keyword):
    D = Downloader()
    url = 'https://www.google.com/search?q=' + urllib.quote_plus(keyword)
    html = D(url)
    tree = lxml.html.fromstring(html)
    links = []
    for result in tree.cssselect('h3.r a'):
        link = result.get('href')
        qs = urlparse.urlparse(link).query
        links.extend(urlparse.parse_qs(qs).get('q', []))
    return links
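A short usage sketch, assuming the Downloader above returns the result page HTML as a string; note that the 'h3.r a' selector matches Google's markup as it was when this example was written:

# Hypothetical call: prints the result URLs extracted from Google's 'q' parameters.
for link in search('web scraping with python'):
    print link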
Example #3
def install_maven(**kwargs):
    if installed('mvn'):
        return
    # 3.2.5 is the most recent version of Maven compatible with Java 6.
    dir = 'apache-maven-3.2.5'
    url = 'http://mirrors.sonic.net/apache/maven/maven-3/3.2.5/binaries/{0}-bin.tar.gz'.format(
        dir)
    install_dir = kwargs.get('install_dir', opt_dir)
    with Downloader(kwargs.get('download_dir', '.')).download(url) as f:
        with closing(tarfile.open(f, 'r:gz')) as archive:
            archive.extractall(install_dir)
    add_to_path(os.path.join(install_dir, dir, 'bin', 'mvn'))
Example #4
def threaded_crawler(seed_url,
                     delay=5,
                     user_agent='wswp',
                     proxies=None,
                     num_retries=1,
                     max_threads=10,
                     timeout=60,
                     scrape_callback=None,
                     cache=None):
    """Crawl this website in multiple threads
    """
    #crawl_queue = Queue.deque([seed_url])
    crawl_queue = [seed_url]
    seen = set([seed_url])
    D = Downloader(cache=cache,
                   delay=delay,
                   user_agent=user_agent,
                   proxies=proxies,
                   num_retries=num_retries,
                   timeout=timeout)

    def process_queue():
        while True:
            try:
                url = crawl_queue.pop()
            except IndexError:
                break
            else:
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print 'Error in callback for: {}: {}'.format(url, e)
                    else:
                        for link in links:
                            link = normalize(seed_url, link)
                            print link
                            if link not in seen:
                                seen.add(link)
                                crawl_queue.append(link)

    threads = []
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue:
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True)
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)
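A usage sketch with a hypothetical callback; SLEEP_TIME, normalize() and Downloader are assumed to be defined in the surrounding module, and the seed URL is illustrative only:

# Hypothetical callback: logs each page and returns no extra links.
# A real callback would parse html and return URLs to enqueue.
def log_callback(url, html):
    print 'downloaded {}: {} bytes'.format(url, len(html or ''))
    return []

threaded_crawler('http://example.webscraping.com', max_threads=5,
                 scrape_callback=log_callback)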
Example #5
def link_crawler(seed_url,
                 link_regex=None,
                 delay=5,
                 max_depth=-1,
                 max_urls=-1,
                 headers=None,
                 user_agent='wswp',
                 proxies=None,
                 num_retries=1,
                 scrape_callback=None,
                 cache=None):
    crawl_queue = Queue.deque([seed_url])
    seen = {seed_url: 0}
    num_urls = 0
    # rp = get_robots(seed_url)
    D = Downloader(delay=delay,
                   user_agent=user_agent,
                   proxies=proxies,
                   num_retries=num_retries,
                   cache=cache)
    thrtl = throttle.Throttle(delay)
    headers = headers or {}
    if user_agent:
        headers['User-agent'] = user_agent

    while crawl_queue:
        url = crawl_queue.pop()
        if True:  # rp.can_fetch(user_agent, url):
            print url
            thrtl.wait(url)
            html = D(url)
            links = []
            if scrape_callback:
                links.extend(scrape_callback(url, html) or [])
            depth = seen[url]
            if depth != max_depth:
                if link_regex:
                    links.extend(link for link in get_links(html)
                                 if re.match(link_regex, link))
                for link in links:
                    link = normalize(seed_url, link)

                    if link not in seen:
                        seen[link] = depth + 1
                        # if same_domain(seed_url, link):
                        crawl_queue.append(link)

            num_urls += 1
            if num_urls == max_urls:
                break
        else:
            print 'Blocked by robots.txt:', url
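A sketch of a typical call, assuming get_links(), normalize() and the throttle module are provided elsewhere; the seed URL and regex are illustrative:

# Hypothetical invocation: follow only index/view links, at most two levels deep.
link_crawler('http://example.webscraping.com',
             link_regex='/(index|view)', max_depth=2, delay=3)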
Example #6
def test_rule(url, regexp=''):
    download = Downloader()
    html1 = download.get(url)
    #print html1
    text1 = process_selector(selector, html1.text)
    md51 = md5(text1.encode('utf-8'))
    html2 = download.get(url)
    text2 = process_selector(selector, html2.text)
    md52 = md5(text2.encode('utf-8'))
    if md51 == md52:
        print 'md5 is same'
    else:
        print md51, md52
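test_rule() relies on md5() and process_selector() helpers defined elsewhere in the project; a plausible sketch of the hashing helper (an assumption, not the project's actual code) is:

import hashlib

def md5(data):
    # Assumed helper: return the hex digest of already UTF-8-encoded bytes.
    return hashlib.md5(data).hexdigest()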
Example #7
def download(url='', title='', artist='', gender='', album=''):
    cleanMp3s()
    url = request.form['url']
    title = request.form['title']
    artist = request.form['artist']
    gender = request.form['gender']
    album = request.form['album']
    downloader = Downloader(url, title, artist, gender, album)
    try:
        path = downloader.download()
    except IOError as e:
        return str(e)
    return send_from_directory(os.path.abspath('.'), path, as_attachment=True)
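The function reads request.form and returns send_from_directory(), so it is evidently a Flask view; a minimal sketch of how it might be wired up (the app object and URL rule are assumptions):

from flask import Flask, request, send_from_directory

app = Flask(__name__)

# Hypothetical registration; the real project may use a different route or decorator.
app.add_url_rule('/download', 'download', download, methods=['POST'])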
Example #8
def threaded_crawler(seed_url, delay=5, cache=None, scrape_callback=None, user_agent='wswp', proxies=None, num_retries=1, max_threads=10, timeout=60):
    """Crawl this website in multiple threads
    """
    # the queue of URLs that still need to be crawled
    #crawl_queue = Queue.deque([seed_url])
    crawl_queue = [seed_url]
    # the URLs that have been seen
    seen = set([seed_url])
    D = Downloader(cache=cache, delay=delay, user_agent=user_agent, proxies=proxies, num_retries=num_retries, timeout=timeout)

    def process_queue():
        while True:
            try:
                url = crawl_queue.pop()
            except IndexError:
                # crawl queue is empty
                break
            else:
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print 'Error in callback for: {}: {}'.format(url, e)
                    else:
                        for link in links:
                            link = normalize(seed_url, link)
                            # check whether already crawled this link
                            if link not in seen:
                                seen.add(link)
                                # add this new link to queue
                                crawl_queue.append(link)


    # wait for all download threads to finish
    threads = []
    while threads or crawl_queue:
        # the crawl is still active
        for thread in threads:
            if not thread.is_alive():
                # remove the stopped threads
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue:
            # can start some more threads
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True) # set daemon so the main thread can exit when it receives ctrl-c
            thread.start()
            threads.append(thread)
        # all threads have been processed
        # sleep temporarily so CPU can focus execution on other threads
        time.sleep(SLEEP_TIME)
Example #9
def get_data(url):
	print(url)
	down = Downloader(headers=headers_home)
	path = 'cache/hz.meituan.com/index.html'
	if os.path.exists(path):
		os.remove(path)
	uuid = get_uuid('http://hz.meituan.com/', down)
	if not uuid:
		return
	data = {}
	type_ = 'c' + url.split('/c')[-1][:-1]
	print(type_)
	cateId = type_[1:]
	areaId = '-1'
	# print(cateId, areaId)
	data['FIRST_LEVEL_DIRECTORY'] = '生活服务'
	data['SECOND_LEVEL_DIRECTORY'] = class_[type_]
	down.headers = headers_get
	index = 0
	while True:
		index = index + 1
		down.headers['Referer'] = url + '/' + 'pn' + str(index) + '/'
		url_get = 'http://apimobile.meituan.com/group/v4/poi/pcsearch/50?uuid='+uuid+'&userid=-1&limit=32&offset='+str((index-1)*32)+'&cateId='+cateId+'&areaId='+areaId
		html = down(url_get)
		try:
			search_result = json.loads(html)['data']['searchResult']
		except Exception as e:
			print('in get_data error ', e)
			break  # stop paging: without a parsed result there is nothing to process
		if search_result == []:
			print('search_result is None')
			break
		# print(search_result)
		for one_item in search_result:
			data['SHOP_ID'] = one_item['id']
			data['SHOP_PHOTOS'] = one_item['imageUrl']
			data['SHOP_NAME'] = one_item['title']
			data['ADDRESS'] = one_item['address']
			data['RANK_STARS'] = one_item['avgscore']
			data['AVG_PRICE_TITLE'] = one_item['avgprice']
			tuangou = one_item['deals']
			if not tuangou:
				data['GROUP_BUYING_NUMBER'] = 0
				data['GROUP_BUYING'] = None
			else:
				data['GROUP_BUYING_NUMBER'] = len(tuangou)
				taocan = ''
				for one in tuangou:
					taocan = taocan + '价格' + str(one['price']) + ' 门市价' + str(one['value']) + ' 出售' + str(one['sales'])
				data['GROUP_BUYING'] = taocan
			db.insert_into(data)
Example #10
def search(keyword):
    """
    Google search for a keyword.
    """
    D = Downloader()
    url = "https://www.google.com/search?q=" + ul.quote_plus(keyword)
    html = D(url)
    tree = lxml.html.fromstring(html)
    links = []
    for result in tree.cssselect("h3.r a"):
        link = result.get("href")
        qs = ulp.urlparse(link).query
        links.extend(ulp.parse_qs(qs).get("q", []))
    return links
Example #11
def downloadlink(url='', title='', artist='', gender='', album=''):
    cleanMp3s()
    url = request.form['url']
    title = request.form['title']
    artist = request.form['artist']
    gender = request.form['gender']
    album = request.form['album']
    downloader = Downloader(url, title, artist, gender, album)
    path = downloader.download()
    dir = 'files/'
    if not os.path.exists(dir):
            os.makedirs(dir)
    newpath = dir + path
    os.rename(path, newpath)
    return '<a href="/' + newpath + '">' + newpath + '</a>'
Example #12
def down_info_by_id(one_id=None):
	if not one_id:
		return None
	data = {}
	down = Downloader(headers=headers_home)
	id = one_id['SHOP_ID']
	sql = 'update crawler.mt_meishi set LABEL_IS_CCRAWLED = 2 where SHOP_ID = ' + id
	db.update_data(sql)
	url = HOMEURL + id + '/'
	uuid, data['TELEPHONE'], data['BUSINESS_TIME'] = get_uuid_phone_openTime_wifi(url, down)
	if uuid:
		data['REVIEW_COUNT'], data['NETIZEN_EVALUTION'] = get_review(uuid, id, url, down)
		if data['NETIZEN_EVALUTION'] == None:
			return
		limit = ''' '''
		for key, value in data.items():
			if data[key] != None:
				if type(data[key]) == int:
					limit = limit + str(key) + "=" + str(data[key]) + ","
				else:
					limit = limit + str(key) + "=" + "'" + data[key] + "'" + ","
		limit = limit[:-1]
		sql = 'update crawler.mt_meishi set ' + limit + ' where SHOP_ID = ' + id
		db.update_data(sql)
	else:
		print('uuid is None')
		return
	limit = ''
	sql = ''
	now_time = datetime.now()
	now_time = str(now_time)
	now_time = now_time.split('.')[0]
	data['UPDATE_TIME'] = now_time
	data['LABEL_IS_CCRAWLED'] = 1
	try:
		for key, value in data.items():
			if data[key] != None:
				if type(data[key]) == int:
					limit = limit + str(key) + "=" + str(data[key]) + ","
				else:
					limit = limit + str(key) + "=" + "'" + data[key] + "'" + ","
		limit = limit[:-1]
		sql = 'update crawler.mt_meishi set ' + limit + ' where SHOP_ID = ' + id
		db.update_data(sql)
	except Exception as e:
		print(e)
		pass
Example #13
def main(reparse=False):
    """Main entry point for this ETL process.  Downloads, updates db,
    stores the nightly data.

    This is the binary to run from a cron job.

    """

    os.chdir(os.path.dirname(__file__))
    logger = log.logger()
    logger.info('Starting ETL of FBO Nightly data.')

    # Figure out where we put data
    datadir = get_datadir()
    dbdir = get_dbdir()
    if not os.path.exists(os.path.join(dbdir, "sqlite3")):
        os.makedirs(os.path.join(dbdir, "sqlite3"))

    # Get a database connection, create db if needed
    db = model.FBO(
        "development",
        db_conf_file=os.path.join(
            dbdir,
            "dbconf.yml"))

    # Make sure the db schema is up to date, create tables, etc.
    db.migrate()

    assert os.path.exists(datadir)

    # Download raw data files
    dloader = Downloader(datadir, db, 'nightly')
    dloader.download(fname_urls, True)

    # Do our ETL
    nights = Nightlies(db)
    nights.etl_from_dir(reparse=reparse)

    # Close the db connection
    db.close()

    logger.info('Finished ETL of FBO data.')
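Since the docstring describes this as the binary run from a cron job, a hedged sketch of a command-line entry point follows (the --reparse flag name is an assumption):

if __name__ == '__main__':
    import argparse

    # Hypothetical flag parsing; the real script may wire this up differently.
    parser = argparse.ArgumentParser(description='Nightly FBO ETL')
    parser.add_argument('--reparse', action='store_true',
                        help='re-parse already downloaded nightly files')
    main(reparse=parser.parse_args().reparse)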
Example #14
def install_cmake(package, **kwargs):
    if kwargs.get('check_installed', True) and installed('cmake'):
        return
    dir, version, minor = re.match(r'(cmake-(\d+\.\d+)\.(\d+).*-[^\.]+)\..*',
                                   package).groups()
    # extractall overwrites existing files, so no need to prepare the
    # destination.
    url = 'https://cmake.org/files/v{0}/{1}'.format(version, package)
    install_dir = kwargs.get('install_dir', opt_dir)
    with Downloader(kwargs.get('download_dir', '.')).download(url) as f:
        iszip = package.endswith('zip')
        with zipfile.ZipFile(f) if iszip \
             else closing(tarfile.open(f, 'r:gz')) as archive:
            archive.extractall(install_dir)
    dir = os.path.join(install_dir, dir)
    if platform.system() == 'Darwin':
        dir = glob.glob(os.path.join(dir, 'CMake*.app', 'Contents'))[0]
    cmake_path = os.path.join(dir, 'bin', 'cmake')
    if install_dir != '.':
        add_to_path(cmake_path)
    return cmake_path
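The regular expression splits the binary package name into the extracted directory, the major.minor series used in the download URL, and the patch number; a quick demonstration with a hypothetical package name:

import re

# Hypothetical package name, used only to show how the groups come apart.
package = 'cmake-3.2.3-Linux-x86_64.tar.gz'
dir, version, minor = re.match(r'(cmake-(\d+\.\d+)\.(\d+).*-[^\.]+)\..*',
                               package).groups()
print(dir)      # cmake-3.2.3-Linux-x86_64
print(version)  # 3.2
print(minor)    # 3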
Example #15
    def __init__(self, src_name, inst_name, index_file = None, t_start = None, t_end = None, wavl_min = None, wavl_max = None, data_dir =''):

        # Set instrument properties
        self.source = src_name
        self.instrument = inst_name

        # check the keyword parameters
        if index_file != None:   # Check if the index_file parameter is specified

            # Read index file into a list with each line as a separate element/filename
            with open(index_file) as f:
                file_list = f.readlines()

            file_list = [x.strip() for x in file_list]      # Strip off newlines and trailing/leading whitespace

            self.import_data(file_list)     # If so, we are free to import data

        elif ((t_start != None) and (t_end != None) and (wavl_min != None) and (wavl_max != None)):    # If not, we need to grab the data from the VSO

            # Find the available files for download using the Virtual Solar Observatory
            c = vso.VSOClient() # Initialize Sunpy VSO client
            #qr = c.query(vso.vso.attrs.Time(t_start, t_end), vso.vso.attrs.Instrument(self.instrument), vso.vso.attrs.Wave(wavl_min * u.AA, wavl_max * u.AA))
            #qr = c.query(vso.vso.attrs.Time(t_start, t_end), vso.vso.attrs.Instrument(self.instrument))
            qr = c.query_legacy(tstart=t_start, tend=t_end, instrument=self.instrument, min_wave=wavl_min, max_wave=wavl_max, unit_wave='Angstrom')   # Query the VSO for files
            print(qr)   # Print the query

            # Download the files returned by the query
            dw = Downloader()   # Initialize custom downloader class
            r = c.get(qr, path = data_dir + '/{source}/{instrument}/{file}').wait()

            print(r)

            # Import the data
            # self.import_data(file_list)

        else:   # Invalid keyword combination

            print('Incorrect keyword specification')
Example #16
def process(rules):
	for rule in rules:
		download = Downloader()
		html = download.get(rule.url)
		if html == None:
			logger.error('%s is unreachable' % rule.corp)
			continue
		elif rule.selector:
			text = process_selector(rule,html.text)
		elif rule.types == 'github':
			rule.selector = "div.commit-group-title"
			text = process_selector(rule,html.text)
		else:
			text = html.text
		if text == None:
			continue
		hash_list = dataConfig.hash_list()
		html_md5 = md5(text.encode('utf-8'))  # text is unicode, so encode to UTF-8 before hashing
		if debug:
			print 'html:',text[:20]
			print 'hash_list:',hash_list
			print 'html_md5',html_md5
		
		if len(hash_list) > 0:
			if rule.corp in hash_list.keys():
				if html_md5 == hash_list[rule.corp]:
					logger.info('%s no change'%rule.corp)
				else:  # the hash changed, meaning there is an update: send a mail notification
					logger.warning('%s has update'%rule.corp)
					dataConfig.update_hash(rule.corp,html_md5)
					context = '<a href={0}>{0}</a>'.format(rule.url)
					Notification(rule.message).notification(context)
			else:  # no hash stored for this corp yet, so add it
				logger.info('adding new monitored app: %s' % rule.corp)
				dataConfig.add_hash(rule.corp,html_md5)
		else:  # the hash list is empty, so initialize it first
			logger.info('wam init ....')
			dataConfig.add_hash(rule.corp,html_md5)
Example #17
    def init_connection(self):
        try:
            self.vk_session = vk_api.VkApi(login=os.getenv("LOGIN"),
                                           password=os.getenv("PASSW"))
            try:
                self.vk_session.auth(token_only=True)
            except vk_api.AuthError as e:
                print(e)
                sys.exit(0)
            except vk_api.exceptions.Captcha as e:
                print("CAPTCHA")
                print(e.get_url())
                code = input()
                e.try_again(key=code)

            print("ID:", os.getpid())
            print("Got VK API Session")
            self.group_session = vk_api.VkApi(token=os.getenv("KEY"))
            print("Got Group Session")
            self.longpoll = VkBotLongPoll(self.group_session,
                                          os.getenv("GROUP_ID"))
            print("Got Longpoll Object")
            self.api = self.vk_session.get_api()
            print("Got API Object")
            self.group_api = self.group_session.get_api()
            print("Got Group API Object")
            self.upload = vk_api.VkUpload(self.vk_session)
            print("Got Upload Object")
            self.loader = Downloader()
            print("Got Downloader Object")
        except (requests.exceptions.ConnectionError) as e:
            print("Reinitializing session data")
            print(e)
            print("Timeout:", self.timeout)
            time.sleep(self.timeout)
            self.timeout += 1
            self.init_connection()
Example #18
# 	'ci':'50',
# 	' _lxsdk_s': '1634d0b9358-04-0ab-c9c%7C%7C30'
# }

# user_agent_list = [
# 	'Mozilla/5.0(Macintosh;U;IntelMacOSX10_6_8;en-us)AppleWebKit/534.50(KHTML,likeGecko)Version/5.1Safari/534.50',
# 	'Mozilla/5.0(Windows;U;WindowsNT6.1;en-us)AppleWebKit/534.50(KHTML,likeGecko)Version/5.1Safari/534.50',
# 	'Mozilla/5.0(compatible;MSIE9.0;WindowsNT6.1;Trident/5.0',
# 	'Mozilla/4.0(compatible;MSIE8.0;WindowsNT6.0;Trident/4.0)',
# 	'Mozilla/5.0(Macintosh;IntelMacOSX10.6;rv:2.0.1)Gecko/20100101Firefox/4.0.1',
# 	'Mozilla/5.0(WindowsNT6.1;rv:2.0.1)Gecko/20100101Firefox/4.0.1',
# 	'Opera/9.80(Macintosh;IntelMacOSX10.6.8;U;en)Presto/2.8.131Version/11.11'
# ]

db = Database()
down = Downloader(headers=headers_shop, cache=None)
sql = "SELECT SHOP_ID, SECOND_LEVEL_DIRECTORY FROM crawler.mt_meishi where LABEL_IS_CCRAWLED = 0 and FIRST_LEVEL_DIRECTORY = '休闲娱乐' limit 50;"


def update_shop_by_id(data):
    sql_hand = 'update crawler.mt_meishi set '
    sql_end = ' where SHOP_ID = ' + str(data['SHOP_ID']) + ';'
    sql_body = ''
    for key, value in data.items():
        if data[key] == None:
            continue
        elif type(data[key]) == int:
            sql_body = sql_body + key + ' = ' + str(data[key]) + ', '
        else:
            sql_body = sql_body + key + ' = ' + "'" + str(
                data[key]) + "'" + ', '
Example #19
import os

from download import Downloader
from uncompress import Uncompresser
from makergb import MakeRGB
from makepreview import MakePreview

if __name__ == '__main__':

    with open("creds.txt","r") as f:
        lines = f.readlines()
    username = lines[0].strip()
    password = lines[1].strip()

    # create tool instances
    dler = Downloader(username=username,password=password,DEBUG=True)
    uncomp = Uncompresser(DEBUG=True)
    rgb = MakeRGB(DEBUG=True)
    prev = MakePreview(DEBUG=True)

    # create list of known images
    #LC80130312013273LGN00
    prefix = 'LC8012031'
    #postfix = 'LGN01'
    images = [
        '2014077LGN00',
        '2014061LGN01',
        '2014045LGN00',
        '2014029LGN00',
        '2014013LGN00',
        '2013362LGN00',
Example #20
# -*-coding:utf-8-*-
# coding: utf-8
# coding: gb2312
# __author__='admin'

import urlparse
import itertools
from download import Downloader
import re

D = Downloader()


class Crawler(object):
    def ID_crawler(self, start_url, max_depth=15, max_errors=3):
        global html
        depth = 1
        num_errors = 0
        for page in itertools.count(1):  # iterate page numbers 1, 2, 3, ...
            page_url = start_url + '/%d.html' % page
            if depth != max_depth:
                depth = depth + 1
                html = D(page_url).text
                if html is None:
                    num_errors = num_errors + 1
                    if num_errors == max_errors:
                        print 'It is full of Errors.'
                        break
                else:
                    num_errors = 0
            else:
Example #21
    def setUp(self):
        self.downloader = Downloader()
Example #22
    def __init__(self, key):
        self.dl = Downloader(key)
        self.currSymbol = ""
        self.currData = []
Example #23
import sys
from PyQt5 import QtGui, QtQuick, QtWidgets, QtCore

from download import Downloader

downloader = Downloader('http://cdimage.debian.org/debian-cd/8.4.0/amd64/iso-cd/debian-8.4.0-amd64-netinst.iso')

app = QtWidgets.QApplication(sys.argv)
view = QtQuick.QQuickView()
view.rootContext().setContextProperty('downloader', downloader)
view.setSource(QtCore.QUrl("download.qml"))
view.show()
app.exec_()
Example #24
import os

from download import Downloader
from subprocess import check_call

build = os.environ['BUILD']
cmake_command = [
    'cmake', '-DFMT_EXTRA_TESTS=ON',
    '-DCMAKE_BUILD_TYPE=' + os.environ['CONFIG']
]
build_command = [
    'msbuild', '/m:4', '/p:Config=' + os.environ['CONFIG'], 'FORMAT.sln'
]
test_command = ['msbuild', 'RUN_TESTS.vcxproj']
if build == 'mingw':
    # Install MinGW.
    mingw_url = 'http://ufpr.dl.sourceforge.net/project/mingw-w64/' + \
      'Toolchains%20targetting%20Win64/Personal%20Builds/mingw-builds/' + \
      '4.9.2/threads-win32/seh/x86_64-4.9.2-release-win32-seh-rt_v3-rev1.7z'
    with Downloader().download(mingw_url) as f:
        check_call(['7z', 'x', '-oC:\\', f])

    # Remove path to Git bin directory from $PATH because it breaks MinGW config.
    path = os.environ['PATH'].replace(r'C:\Program Files (x86)\Git\bin', '')

    os.environ[
        'PATH'] = r'C:\Program Files (x86)\MSBUILD\12.0\bin\;' + path + r';C:\mingw64\bin'
    cmake_command.append('-GMinGW Makefiles')
    build_command = ['mingw32-make', '-j4']
    test_command = ['mingw32-make', 'test']

check_call(cmake_command)
check_call(build_command)
Example #25
    def test_download_to_temp_dir(self):
        d = Downloader()
        with util.CaptureStdout():
            with d.download('file://' + __file__) as f:
                filename = f
        self.assertEqual(tempfile.gettempdir(), os.path.dirname(filename))
Example #26
                     ReleaseDate=g['releasetime'],
                     WorkExperience=g['workexperience'],
                     RecruitingNumbers=g['recruitnumbers'],
                     WorkPlace=g['workplace'],
                     EducationalRequirements=g['educationalrequirements'],
                     JobCategory=g['jobcategory'],
                     JobDescription=g['jobdescription'])


if __name__ == '__main__':
    page_max = 90
    pages_url = [
        "http://sou.zhaopin.com/jobs/searchresult.ashx?jl=全国&kw=嵌入式&p=%s" %
        str(i) for i in range(page_max)
    ]
    D = Downloader()  # Downloader used to fetch the HTML pages
    import requests

    sqlcache = SQLCache()
    sqlqueue = SQLQueue()
    jobscache = JobsCache()
    sqlseen = Seen()
    drop(sqlqueue, sqlseen)
    jobscache.delete_all()
    jobscache = JobsCache()

    urlsx = [extract_urls(D(url)) for url in pages_url]
    for urls in urlsx:
        for url in urls:
            sqlqueue.append(url)
Example #27
def download(url, cookie=None):
    return Downloader('.').download(url, cookie)
Example #28
#!/usr/bin/env python

from download import Downloader
from fee import Fee
from database import Fund
import database

if __name__ == '__main__':
    dl = Downloader()
    data = dl.get_fee('0P0000YXTA')
    fee = Fee(data)
    fund = Fund(management=fee.management,
                custodial=fee.custodial,
                distribution=fee.distribution)
    database.session.add(fund)
    database.session.commit()
Example #29
from __future__ import print_function
import os, re, shutil, tarfile, tempfile
from bootstrap import bootstrap
from contextlib import closing
from download import Downloader
from subprocess import call, check_call, check_output, Popen, PIPE, STDOUT

build = os.environ['BUILD']
if build == 'doc':
    returncode = 1
    travis = 'TRAVIS' in os.environ
    workdir = tempfile.mkdtemp()
    try:
        doxygen_url = 'http://ftp.stack.nl/pub/users/dimitri/doxygen-1.8.10.linux.bin.tar.gz'
        dir = os.path.dirname(os.path.realpath(__file__))
        with Downloader().download(doxygen_url) as f:
            with closing(tarfile.open(f, 'r:gz')) as archive:
                archive.extractall(dir)
        doxygen = os.path.join(dir, 'doxygen-1.8.10/bin/doxygen')
        returncode, repo_dir = __import__('build-docs').build_docs(
            workdir, doxygen)
        if returncode == 0 and os.environ['TRAVIS_BRANCH'] == 'master':
            # Push docs to GitHub pages if this is a master branch.
            if travis:
                check_call(
                    ['git', 'config', '--global', 'user.name', 'amplbot'])
                check_call([
                    'git', 'config', '--global', 'user.email', '*****@*****.**'
                ])
            check_call(['git', 'add', '--all'], cwd=repo_dir)
            if call(['git', 'diff-index', '--quiet', 'HEAD'], cwd=repo_dir):
Example #30
def down_shop_name(one_class=None):
    down = Downloader(headers=request_headers)
    # http://hz.meituan.com/meishi/api/poi/getPoiList?cityName=%E6%9D%AD%E5%B7%9E&cateId=20004&page=2
    page = 0

    def get_uuid(url):
        text_uuid = down(url)
        # print(text_uuid)
        re_uuid = re.compile(r'"uuid":"(.*?)",', re.IGNORECASE)
        try:
            uuid = re_uuid.findall(text_uuid)[0]
        except Exception as e:
            print('get uuid error: ', e)
            return None
        return uuid

    while True:
        html = ''
        page += 1
        print('*****in*****down_shop_name*****')
        # print(one_class)
        # print('*****one class*****')
        type_code = one_class['DIRECTORY_CODE']
        url_uuid = 'http://hz.meituan.com/meishi/%s/pn%d/' % (type_code, page)
        print(url_uuid)
        down.headers['Referer'] = url_uuid
        down.headers['Accept'] = home_request_headers_accept
        uuid = get_uuid(url_uuid)
        if not uuid:
            break
        type_code = type_code.replace('c', '')
        url = r'http://hz.meituan.com/meishi/api/poi/getPoiList?uuid=' + uuid + r'&platform=1&partner=126&originUrl=' + url_uuid + r'&riskLevel=1&optimusCode=1&cityName=%E6%9D%AD%E5%B7%9E&' + 'cateId=%s&areaId=0&sort=&dinnerCountAttrId=&page=%d&userId=0' % (
            type_code, page)
        # print(url)
        # url = r'http://hz.meituan.com/meishi/c17/pn2/'
        down.headers['Accept'] = request_headers_accept
        html = down(url)
        # json_html = json.loads(html)
        # print(json_html['status'])
        # print(type(html))
        if len(html) < 100:
            break
        # print(html[:100])
        shop_info_list = extrace(html, type_code)
        threads = []

        def write_data_to_db():
            data = shop_info_list.pop()
            db = Database()
            db.insert_into(data)

        while shop_info_list or threads:
            for thread in threads:
                if not thread.is_alive():
                    threads.remove(thread)
            while len(threads) < 5 and shop_info_list:
                thread = threading.Thread(target=write_data_to_db)
                thread.setDaemon(True)
                thread.start()
                threads.append(thread)
        time.sleep(random.uniform(30, 60))