Example #1
    def craw_ips_by_page(self, page):
        url = "https://www.xicidaili.com/nn/%s" % page
        downloader = download.Downloader()
        html_content = downloader.requests_get(url, "html")

        # If the page comes back as "0", the local IP has probably been banned; fall back to the backup proxies
        if html_content == "0":
            with open("ips_copy.json", "r") as f:
                ips = json.loads(f.read())
            for ip in ips:
                proxy_temp = {"http": "http://%s:%s" % (ip['ip'], ip['port'])}
                print("Local IP unavailable, trying http://%s:%s" % (ip['ip'], ip['port']))
                try:
                    res = requests.get(url, timeout=1, proxies=proxy_temp)
                    if res.status_code == 200:
                        html_content = res.content.decode("utf-8", "ignore")
                        break
                except requests.RequestException:
                    continue

        soup = BeautifulSoup(html_content, 'html.parser')
        all_trs = soup.find("table", id="ip_list").find_all('tr')
        for tr in all_trs[1:]:
            tds = tr.find_all("td")
            ip = {
                'ip': tds[1].get_text(),
                'port': tds[2].get_text(),
                'type': tds[5].get_text()
            }
            # ip = tds[1].get_text()
            # Check whether the proxy is usable before adding it to the pool
            if self.check_ip(ip):
                self.ip_pool.append(ip)
            if len(self.ip_pool) >= self.max_ip_num:
                break
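
The snippet calls self.check_ip, which is not shown here. A minimal sketch, written as a standalone function and assuming the check simply issues a test request through the candidate proxy (the probe URL and timeout are assumptions, not part of the original class):

import requests

def check_ip(ip):
    # Build a proxies dict from the candidate entry collected above.
    proxies = {"http": "http://%s:%s" % (ip["ip"], ip["port"])}
    try:
        # Probe an arbitrary page through the proxy; a 200 response counts as usable.
        res = requests.get("http://httpbin.org/ip", proxies=proxies, timeout=3)
        return res.status_code == 200
    except requests.RequestException:
        return False
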
Example #2
def getData(year, file, toDir):
    urls = []
    # Read the URL list and keep every line that mentions the requested year
    with open(file) as f:
        for line in f:
            if year in line:
                print(line)
                urls.append(line.strip())
    # Download each of the collected URLs
    for url in urls:
        print("——————————————————————————————————————————————")
        download.Downloader(toDir).donwnloading(url)
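
A hypothetical call, assuming a plain-text file with one URL per line (the year, file name, and target directory below are made up for illustration):

getData("2018", "url_list.txt", "./downloads")
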
Example #3
    def __init__(self, root, data, threadNum):
        self.root = root
        self.threadNum = threadNum
        self.downloader = download.Downloader()
        self.headers = {
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20'
        }
        self.s_list = []
        self.links = []
        # self.total_count = len(data)
        # self.start_time = time.time()
        # sizex, sizey = getTerminalSize()
        # self.width = sizex
        # self.height = sizey
        for line in data:
            # print line
            self.links.append(line.strip())
        self.work()
Example #4
def download_task(q):
    # Create a new downloader instance
    dl = download.Downloader()
    last_update = None
    while True:
        if last_update == hash(q):
            time.sleep(5)
            continue
        last_update = hash(q)

        # Get the next song that hasn't been downloaded
        song = q.get_next_song(only_undownloaded=True)

        if song is None:
            time.sleep(5)
            continue

        # Download the song and store the filename
        filename = dl.download(song["youtube_id"])
        # Update the database with the filename
        q.update_song(song["id"], filename=filename)
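
The polling loop above only skips work when hash(q) is unchanged, so it assumes the queue object's hash reflects its current contents rather than its identity. A minimal sketch of a queue wrapper with that property (every name in this sketch is an assumption, not the project's real API):

class SongQueue:
    def __init__(self):
        # Each entry: {"id": ..., "youtube_id": ..., "filename": None}
        self.songs = []

    def __hash__(self):
        # Hash the queue's current state so workers can detect changes.
        return hash(tuple((s["id"], s["filename"]) for s in self.songs))

    def get_next_song(self, only_undownloaded=False):
        for song in self.songs:
            if not only_undownloaded or song["filename"] is None:
                return song
        return None

    def update_song(self, song_id, **fields):
        for song in self.songs:
            if song["id"] == song_id:
                song.update(fields)
                return
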
Example #5
    def __init__(self, root, data, threadNum):
        self.root = root
        if not self.root:
            print 'no root url given'
        self.threadNum = threadNum
        self.headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0'
        }
        self.task = Queue.Queue()
        self.s_list = []
        self.downloader = download.Downloader()
        self.total_count = len(data)
        self.start_time = time.time()
        sizex, sizey = getTerminalSize()
        self.width = sizex
        self.height = sizey
        for line in data:
            # print line
            self.task.put(line.strip())
        self.remaining_count = self.task.qsize()
        self.work()
Example #6
def main():
    template_url = 'http://example.webscraping.com/ajax/search.json?page={}&page_size=10&search_term={}'
    countries = set()
    downloader = download.Downloader(mongo_cache.MongoCache())

    for letter in string.lowercase:
        page = 0
        while True:
            html = downloader(template_url.format(page, letter))
            print html
            try:
                ajax = json.loads(html)
            except ValueError as e:
                print e
                ajax = None
            else:
                for record in ajax['records']:
                    countries.add(record['country'])
            page += 1
            if ajax is None or page >= ajax['num_pages']:
                break

    with open('countries.txt', 'w') as fp:
        fp.write('\n'.join(sorted(countries)))
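
Here download.Downloader is constructed with a cache object (mongo_cache.MongoCache) and is itself called like a function to fetch each URL. Assuming the downloader only needs mapping-style item access on the cache, a throwaway in-memory stand-in might look like this (illustrative only, not the real MongoCache API):

class DictCache:
    """In-memory cache keyed by URL; a stand-in for mongo_cache.MongoCache."""

    def __init__(self):
        self.store = {}

    def __getitem__(self, url):
        # A KeyError here signals a cache miss to the caller.
        return self.store[url]

    def __setitem__(self, url, result):
        self.store[url] = result

Such an object could then be passed as download.Downloader(DictCache()) during local testing instead of the Mongo-backed cache.
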
Example #7
def direct_download(url):
    downloader = download.Downloader()
    return downloader(url)
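
A hypothetical call, assuming the Downloader instance returns the page content when invoked with a URL:

html = direct_download('http://example.webscraping.com')
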
Example #8
        # Fix includes.
        dst.write(
            re.sub(r'# *include "(format.h|posix.h)"', r'#include "mp/\1"',
                   line))


def extract(archive, filenames, dest, archive_dir, **kwargs):
    dest = os.path.join(project_dir, dest)
    if kwargs.get('clean'):
        fileutil.rmtree_if_exists(dest)
        os.mkdir(dest)
    for filename in filenames:
        dest_path = os.path.join(dest, filename)
        if filename.endswith('/'):
            if not os.path.exists(dest_path):
                os.mkdir(dest_path)
            continue
        with archive.open(archive_dir + filename) as src:
            with open(dest_path, 'w') as dst:
                copyfile(src, dst)


d = download.Downloader()
with d.download(
        'https://github.com/cppformat/cppformat/archive/master.zip') as f:
    with zipfile.ZipFile(f, 'r') as zf:
        root = 'cppformat-master/'
        extract(zf, include_files, 'include/mp', root)
        extract(zf, src_files, 'src', root)
        extract(zf, test_files, 'test', root + 'test/')
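
copyfile and fileutil.rmtree_if_exists are project helpers that are not shown in this snippet. A minimal sketch of the former, under the assumption that it only needs to move bytes between two open file objects:

def copyfile(src, dst, chunk_size=64 * 1024):
    # Copy from one open file object to another in fixed-size chunks.
    while True:
        chunk = src.read(chunk_size)
        if not chunk:
            break
        dst.write(chunk)
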
Example #9
def main(bucket, filter, error, no_dl):
    if not no_dl:
        dl = download.Downloader(bucket, filter)
        dl.run()

    process.Processor(error).run()