def __init__(self, name: str, url: (str, list), rule_name: str):
    """
    Initialize a crawler application.
    :param name: application name
    :param url: entry URL, or a list of entry URLs
    :param rule_name: name of the parse rule matching the entry URL's content
    """
    self.name = name
    get_app_config().app_name = name
    self.urls = [Url(i) for i in url] if isinstance(url, list) else [Url(url)]
    self.rule_name = rule_name
    self.current_job = None
    self.normal_thread_count = 1
    self.proxy_thread_count = 1
    self.config = get_app_config()
def load(cls, url: Url, retry=25):
    """
    Download the URL and return the fetched content.
    The cache (local file cache or Elasticsearch cache) is consulted first;
    on a cache hit the cached content is returned directly.
    :param url: the Url to fetch
    :param retry: maximum number of attempts
    :return: (ok, result) tuple
    """
    ok, result = False, None
    referer = url.referer
    url = url.value
    if referer:
        headers['referer'] = referer
    # Proxy setup
    app_config = get_app_config()
    proxies = app_config.proxy_mapping.get_url_proxy(url)
    logger.info('Start request url: {}, proxy: {}'.format(url, proxies))
    for i in range(retry):
        from_t = datetime.datetime.now()
        logger.info('[{}] request url: {}, proxy: {}'.format(i, url, proxies))
        try:
            # Use a session if configured to do so
            if app_config.use_session:
                session = get_current_thread_session()
                session.proxies = proxies
                # Fetch with a session so Tmall's 302 redirects are followed
                # automatically; there may be several hops, so don't use a
                # short timeout (120 s is deliberate)
                res = session.get(url, timeout=120)
            else:
                res = requests.get(url, headers=headers, proxies=proxies)
            # Measure elapsed time after the request completes, not before it
            dt = int((datetime.datetime.now() - from_t).total_seconds() * 1000)
            if res.ok:
                logger.info('[{}] request url success, takes: {} ms, size: {}, {}'.format(
                    i, dt, len(res.text), url))
                if res.encoding == 'ISO-8859-1':
                    result = res.content.decode('utf8')
                elif res.encoding is None and res.apparent_encoding == 'ISO-8859-1':
                    result = res.content.decode('gb2312')
                elif res.encoding in ('gbk', 'GBK', 'gb2312', None):
                    result = res.text
                else:
                    result = res.content.decode('utf8')
                # Verify the body looks like a normal page (per config); if not, retry
                if app_config.fail_conditions.test(url, result):
                    ok = True
                    break
                else:
                    # Blocked by anti-scraping; wait 4 s for the proxy to rotate
                    time.sleep(4)
            else:
                logger.info('[{}] request url failed, takes {} ms, code: {}-{}'.format(
                    i, dt, res.status_code, res.reason))
                time.sleep(0.5)
        except Exception as e:
            logger.error('[{}] request url failed, error: {}'.format(i, e))
            time.sleep(0.1)
    return ok, result
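The docstring above mentions a cache-first lookup (local file or Elasticsearch) that is not visible in this excerpt. A minimal sketch of what such a wrapper could look like, assuming hypothetical cache_get/cache_put helpers standing in for the cache layer:

# Sketch only: cache-first wrapper around load(); cache_get and cache_put
# are hypothetical helpers, not part of the snippet above.
def load_cached(cls, url: Url, retry=25):
    cached = cache_get(url.value)          # hypothetical cache lookup
    if cached is not None:
        return True, cached                # cache hit: skip the network entirely
    ok, result = cls.load(url, retry=retry)
    if ok:
        cache_put(url.value, result)       # hypothetical cache write
    return ok, result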
def create_app(env: str = None):
    """
    Application factory.
    :return: Flask application
    """
    if env is None:
        env = os.getenv('FLASK_ENV', 'development')
    app = Flask(__name__)
    app.config.from_object(get_app_config(env))
    app.logger = structlog.get_logger()
    with app.app_context():
        es_uri = app.config.get('ELASTICSEARCH_URL')
        if es_uri:
            # Set up the connection
            es = get_elasticsearch_client(es_uri)
            # Create the index, ignoring "already exists" (HTTP 400) errors
            index_name = os.getenv('ELASTICSEARCH_INDEX_NAME')
            es.indices.create(index=index_name, ignore=400)
            # Store the Elasticsearch client on the application object
            app.elasticsearch = es
    register_extensions(app)
    register_blueprints(app)
    register_errorhandlers(app)
    register_jinja_env(app)
    return app
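Since create_app() stores the client on the application object, request code can reach it through current_app. A minimal sketch, assuming a hypothetical search view and a match query on a hypothetical 'data' field:

# Sketch only: reading the client attached in create_app() above.
from flask import current_app, jsonify

def search(index_name: str, query: str):
    es = current_app.elasticsearch  # set in create_app() above
    hits = es.search(index=index_name, body={'query': {'match': {'data': query}}})
    return jsonify(hits['hits']['hits'])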
def create_app(app_config=None):
    app = Flask(__name__)
    CORS(app)
    # Set the foodie_app config
    app_config = app_config if app_config else get_app_config()
    app.config.from_object(app_config)
    # Initialize the database
    db.init_app(app)
    migrate.init_app(app, db)
    # Register blueprints
    app.register_blueprint(api)
    return app
def start_app(app_name: str, urls: list, rule_name: str, process_count=3):
    mode = get_app_config().app_mode
    logger.info('Start app [{}] with mode [{}]'.format(app_name, mode))
    # Pick the App implementation matching the configured mode
    if mode == AppMode.MULTI_THREAD:
        from app_threaded import App
    elif mode == AppMode.THREAD_POOL:
        from app_pooled import App
    elif mode == AppMode.MULTI_PROCESS:
        from app_processed import App
    else:
        logger.error('Unsupported mode: %s', mode)
        return
    app = App(app_name, urls, rule_name)
    app.schedule(process_count)
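For illustration, a sketch of driving start_app(): the mode is read from the shared config, so it must be set before the call (values reuse the jd_page example from the threaded App further down):

# Illustrative only: run the crawler in multi-thread mode with three workers.
get_app_config().app_mode = AppMode.MULTI_THREAD
start_app('sync_app',
          urls=['https://list.jd.hk/list.html?cat=1316,1381,1389&page=1'],
          rule_name='jd_page',
          process_count=3)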
def create_app(config_name, github_api=github):
    app = Flask(__name__)
    app_config = config.get_app_config(os.environ)
    app.secret_key = app_config['app_secret']
    blueprint = make_github_blueprint(
        client_id=app_config['github_client_id'],
        client_secret=app_config['github_client_secret'],
        scope='public_repo',
        redirect_url='/replicate'
    )
    app.register_blueprint(blueprint, url_prefix='/replicate')

    @app.route('/', methods=['GET'])
    def index(**kwargs):
        return render_template('index.html'), 200

    @app.route('/replicate', methods=['GET'])
    def replicate(**kwargs):
        if not github_api.authorized:
            # flask-dance exposes the OAuth entry point as "github.login"
            return redirect(url_for('github.login'))
        user_response = github_api.get('/user')
        target_user = user_response.json()['login']
        response = github_api.post(
            f'/repos/{app_config["repo_owner"]}/{app_config["repo_name"]}/forks'
        )
        repo_link = config.get_url_to_repo(target_user, app_config['repo_name'])
        if response.status_code == 202:
            return render_template('success.html', repo_link=repo_link)
        return render_template('failure.html', response=response.json())

    return app
def __init__(self):
    self.app_name = get_app_config().app_name
    self.client = Elasticsearch(hosts=self.HOSTS)
import hashlib
import pathlib
import functools
import datetime

from common.annotation import time_it
from config import *
from config import get_app_config
from common.log import logger
from common.util import now, take_ms

if get_app_config().cache_mode == CacheMode.ELASTICSEARCH:
    from elasticsearch import Elasticsearch

    class EsClient(object):
        HOSTS = ['192.168.0.1', '192.168.0.2', '192.168.0.3']
        INDEX = 'index-'
        TYPE = 'default'
        KEY = 'data'
        INSTANCE = None
        MAX_ID_LENGTH = 256

        def __init__(self):
            self.app_name = get_app_config().app_name
            self.client = Elasticsearch(hosts=self.HOSTS)

        @staticmethod
        def get_id(url):
            # Document ids are derived from the URL, truncated to
            # MAX_ID_LENGTH characters to stay within id size limits
            return url if len(url) < EsClient.MAX_ID_LENGTH else url[:EsClient.MAX_ID_LENGTH]
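The EsClient excerpt ends at get_id(). For context, a sketch of how save/load methods might use it together with the constants above; the index naming and method bodies are assumptions (elasticsearch-py 6.x-style calls), not the original implementation:

# Sketch: possible save/load methods for EsClient (assumed, not original code).
def save(self, url, content):
    # Assumed layout: one index per app, one field (KEY) holding the page body
    self.client.index(index=self.INDEX + self.app_name, doc_type=self.TYPE,
                      id=self.get_id(url), body={self.KEY: content})

def load(self, url):
    try:
        doc = self.client.get(index=self.INDEX + self.app_name,
                              doc_type=self.TYPE, id=self.get_id(url))
        return doc['_source'][self.KEY]
    except Exception:
        return None  # treat any miss or connectivity error as a cache miss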
"""
(Multi-threaded) crawler application framework. It creates a scheduled scraping
job from just an entry URL and the name of the parse rule for that URL's
content; links extracted later are added to the crawl queue automatically.
For example:
    my_app = App('myApp', url='http://www.sina.com', rule_name='sina')
    my_app.schedule()
"""


def __init__(self, name: str, url: (str, list), rule_name: str):
    """
    Initialize a crawler application.
    :param name: application name
    :param url: entry URL, or a list of entry URLs
    :param rule_name: name of the parse rule matching the entry URL's content
    """
    super(App, self).__init__(name, url, rule_name)


def start_job(self):
    job = Job(self.name, self.urls, self.rule_name, self.normal_thread_count)
    job.start()


def schedule(self, normal_thread_count=2):
    self.normal_thread_count = normal_thread_count
    self.start_job()


if __name__ == '__main__':
    get_app_config().proxy_mapping = ProxyMapping.of_asdl_high()
    app = App('sync_app', rule_name='jd_page',
              url='https://list.jd.hk/list.html?cat=1316,1381,1389&page=1')
    app.schedule()
from config import get_app_config
from foodie_app import create_app

if __name__ == '__main__':
    app = create_app(get_app_config())
    app.run(host='127.0.0.1', port=5000)
def is_proxy_url(cls, url):
    pm = get_app_config().proxy_mapping
    return pm.get_url_proxy(url) is not None
def put(self, item, block=True, timeout=None):
    # When no_save is set, discard items instead of enqueueing them
    if not get_app_config().no_save:
        return super().put(item, block, timeout)
    return None
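To illustrate the override: with no_save enabled, put() becomes a no-op, which lets a drain-only run empty the queue without persisting newly discovered items (the queue class name here is assumed):

# Illustrative only: behaviour of the overridden put() under no_save.
q = UrlQueue()                      # hypothetical name for the Queue subclass above
get_app_config().no_save = True
q.put('https://example.com/page1')  # silently dropped, returns None
get_app_config().no_save = False
q.put('https://example.com/page1')  # enqueued as usual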
import os

import config
from app import create_app

if __name__ == '__main__':
    app_config = config.get_app_config(os.environ)
    app = create_app(app_config['app_config_name'])
    app.run(host=app_config['host'], port=app_config['port'])