def __init__(self, num_thread, segment_size, overwrite=False, spider=Crawler()):
    self.files = []
    self.pool = ThreadPool(num_thread)
    self.overwrite = overwrite
    self.spider = spider
    self.segment_size = segment_size
def __init__(self, url, path, overwrite=False, spider=Crawler()):
    self.url = url
    self.path = path
    self.tmp_path = self.path + '.t'
    self.name = os.path.split(self.path)[-1]
    self.overwrite = overwrite
    self.spider = spider
    self._status = INITIALIZED
    self.total = 0
    self.size = 0
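# The '.t' suffix in tmp_path above suggests a download-to-temp-then-rename
# convention. A minimal sketch of that pattern (`finalize` is a hypothetical
# helper for illustration, not part of the original class):
import os

def finalize(tmp_path, path):
    # Promote the finished temp file to its final name in one step.
    os.replace(tmp_path, path)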
def __init__(self, url, path, segment_size=10 * 1024 * 1024, overwrite=False, spider=Crawler()):
    self.url = url
    self.path = path
    self.name = os.path.split(self.path)[-1]
    self.overwrite = overwrite
    self.spider = spider
    self.segment_size = segment_size
    self._status = INITIALIZED
    self.segmentable = False
    self.total = 0
    self.segments = []
    self._get_head()
    self._segmentation()
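# A minimal sketch of what `_segmentation` plausibly does, assuming `self.total`
# holds the Content-Length obtained by `_get_head` and `self.segmentable`
# reflects server Range support; the real implementation may differ.
def _segmentation_sketch(total, segment_size):
    """Split [0, total) into inclusive (start, end) byte ranges for HTTP Range requests."""
    segments = []
    for start in range(0, total, segment_size):
        end = min(start + segment_size, total) - 1
        segments.append((start, end))
    return segments

# e.g. _segmentation_sketch(25 * 1024 * 1024, 10 * 1024 * 1024)
# -> [(0, 10485759), (10485760, 20971519), (20971520, 26214399)]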
class Getter:

    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """
        Check whether the proxy pool has reached its size limit.
        :return: True if the pool is full, False otherwise
        """
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print('Getter is running')
        if not self.is_over_threshold():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                proxies = self.crawler.get_proxies(callback)
                for proxy in proxies:
                    self.redis.add(proxy)
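# Usage sketch: run the getter on a fixed cycle until the pool is full.
# `CYCLE_SECONDS` is a hypothetical constant, not from the original module;
# the real scheduler may drive Getter differently.
import time

CYCLE_SECONDS = 60

if __name__ == '__main__':
    getter = Getter()
    while True:
        getter.run()            # crawls only while the pool is below POOL_UPPER_THRESHOLD
        time.sleep(CYCLE_SECONDS)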
import re
import os
import sys
import time
from urllib.parse import urlencode

from bs4 import BeautifulSoup

from utils.crawler import Crawler
from utils.config import Config
from utils.thread import ThreadPool
from utils.common import Task, repair_filename, touch_dir, size_format
from utils.playlist import Dpl
from utils.downloader import FileManager

spider = Crawler()
VIDEO, PDF, RICH_TEXT = 1, 3, 4
COURSEWARE = {
    VIDEO: 'Video',
    PDF: 'PDF',
    RICH_TEXT: 'Rich_text'
}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.167 Safari/537.36',
}
spider.headers.update(headers)
CONFIG = Config()


def login(username, password):
# -*- coding: utf-8 -*-
"""NetEase Open Course (open_163)"""

import re
import time

from bs4 import BeautifulSoup
from Crypto.Cipher import AES

from moocs.utils import *
from utils.crawler import Crawler

name = "open_163"
need_cookies = False
CANDY = Crawler()
CONFIG = {}
FILES = {}
VIDEOS = []
exports = {}
__all__ = ["name", "need_cookies", "start", "exports"]


def get_summary(url):
    """Collect course metadata from the course home page."""
    res = CANDY.get(url).text
    soup = BeautifulSoup(res, 'html.parser')

    links = []
    if re.match(r'https?://open.163.com/special/', url):
        # Parse the links to the individual lessons from the course home page
        names = soup.find_all('div', class_='g-container')[1]
        organization = names.find('a').string.strip()
import json
import re
import os

from bs4 import BeautifulSoup

from utils.crawler import Crawler
from utils.config import Config
from utils.db import SQLite, BigintField, StringField, DoubleField, Model
from utils.filer import touch_dir

spider = Crawler()
CONFIG = Config('jd_spider').conf
GLOBAL = Config('jd_spider').glob
GLOBAL['data_dir'] = touch_dir(CONFIG['data_dir'])


class AirConditioning(Model):
    skuid = BigintField('skuid', primary_key=True, not_null=True)
    brand = StringField('brand')
    kind = StringField('kind')
    horsepower = StringField('horsepower')
    mode = StringField('mode')
    EEI = BigintField('EEI')
    EER = DoubleField('EER')
    rfc = BigintField('rfc')
    rfp = BigintField('rfp')
    noise = BigintField('noise')
    price = BigintField('price')
    vip_price = BigintField('vip_price')
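# For reference, the field declarations above correspond roughly to the SQLite
# DDL below (a sketch: BigintField -> INTEGER, StringField -> TEXT,
# DoubleField -> REAL; the actual mapping inside utils.db may differ, and the
# constant name is hypothetical).
CREATE_AIR_CONDITIONING = """
CREATE TABLE IF NOT EXISTS AirConditioning (
    skuid      INTEGER PRIMARY KEY NOT NULL,
    brand      TEXT,
    kind       TEXT,
    horsepower TEXT,
    mode       TEXT,
    EEI        INTEGER,
    EER        REAL,
    rfc        INTEGER,
    rfp        INTEGER,
    noise      INTEGER,
    price      INTEGER,
    vip_price  INTEGER
)
"""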
import hashlib
import os
import sys
import time
from urllib.parse import urlencode

from bs4 import BeautifulSoup

from utils.crawler import Crawler
from utils.config import Config
from utils.thread import ThreadPool
from utils.common import Task, repair_filename, touch_dir, size_format
from utils.playlist import Dpl
from utils.downloader import FileManager
from utils.ffmpeg import FFmpeg

spider = Crawler()
spider.trust_env = False
VIDEO, PDF, RICH_TEXT = 1, 3, 4
COURSEWARE = {VIDEO: "Video", PDF: "PDF", RICH_TEXT: "Rich_text"}
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.167 Safari/537.36",
}
spider.headers.update(headers)
CONFIG = Config()


def login(username, password):
    """Log in and obtain a token."""
    pd = hashlib.md5()
import json
import os
import time
from urllib import parse

from utils.common import store_cookies
from utils.crawler import Crawler

spider = Crawler()


def get_index(words, start_date, end_date):
    """Fetch index data for the given keywords within a date range."""
    wordlist = ""
    for n in range(len(words)):
        wordlist += '&wordlist%5B{}%5D={}'.format(n, words[n])
    url = 'http://index.baidu.com/Interface/Newwordgraph/getIndex?region=0&startdate={}&enddate={}{}'\
        .format(start_date, end_date, wordlist)
    res = spider.get(url)
    return res.json()


def decrypto(origin, key):
    """Decrypt the index payload: each character of `origin` is looked up in
    the first half of `key` and replaced by the character at the mirrored
    position in the second half."""
    s = ''
    for c in origin:
        if c:
            s += key[key.index(c) + len(key) // 2]
    data = []
    for i in s.split(','):
        data.append(int(i))
    return data
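# Worked example of the mapping in `decrypto` (hypothetical key and payload,
# for illustration only; real keys come from the Baidu Index API). The first
# half of `key` is the cipher alphabet; the second half holds the plaintext
# digits and comma separators at mirrored positions.
key = 'abcde0123,'   # len(key) // 2 == 5: 'a'->'0', 'b'->'1', 'c'->'2', 'd'->'3', 'e'->','
origin = 'bcead'     # decodes to '12,03'
assert decrypto(origin, key) == [12, 3]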
import json
import hashlib
import re
import os
import sys

from bs4 import BeautifulSoup

from utils.crawler import Crawler
from utils.config import Config
from utils.filer import repair_filename, touch_dir, Dpl
from utils.thread import ThreadPool
from utils.async_lib.utils import Task

spider = Crawler()
VIDEO, PDF, RICH_TEXT = 1, 3, 4
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.167 Safari/537.36',
}
srt_types = ["zh-cn", "en"]
spider.headers.update(headers)
CONFIG = Config()


def login(username, password):
    """Log in and obtain a token."""
    pd = hashlib.md5()
    pd.update(password.encode('utf-8'))
    passwd = pd.hexdigest()
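# The snippet above ends mid-function; the same hashing step in isolation,
# as the visible code suggests the client sends the MD5 hex digest of the
# password (stdlib only, no assumptions about the login endpoint itself):
import hashlib

def md5_hex(password):
    # MD5 hex digest of the UTF-8 encoded password.
    return hashlib.md5(password.encode('utf-8')).hexdigest()

# md5_hex('secret') -> '5ebe2294ecd0e0f08eab7690d2a6ee69'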