示例#1
0
 def __init__(self, name: str, url: (str, list), rule_name: str):
     """
     Set up a crawl application.

     :param name: application name; it is also written to ``app_name`` on the
         object returned by ``get_app_config()``
     :param url: entry link, either a single value or a list of values; each
         one is wrapped in ``Url``
     :param rule_name: name of the parse rule applied to the entry link content
     """
     self.name = name
     get_app_config().app_name = name

     # Normalise the entry link(s) to a list before wrapping each one.
     entry_links = url if isinstance(url, list) else [url]
     self.urls = [Url(link) for link in entry_links]

     self.rule_name = rule_name
     self.current_job = None
     self.normal_thread_count = 1
     self.proxy_thread_count = 1
     self.config = get_app_config()
示例#2
0
    def load(cls, url: Url, retry=25):
        """
        Download a URL and return the decoded response body.

        The request is attempted up to ``retry`` times. An attempt succeeds
        when the HTTP response is OK and ``fail_conditions.test`` of the
        application config returns a truthy value for the decoded body.

        NOTE(review): the original docstring stated that a cache (local file
        or Elasticsearch) is consulted first; no cache lookup is visible in
        this method, so that presumably happens elsewhere -- confirm.

        :param url: ``Url`` object; ``url.value`` is requested and, when set,
            ``url.referer`` is sent as the ``referer`` header
        :param retry: maximum number of attempts
        :return: ``(ok, result)``; ``ok`` is True only when an attempt
            succeeded. ``result`` is the most recently decoded body, which may
            be non-None even when ``ok`` is False, or None if nothing was
            decoded.
        """
        ok, result = False, None
        referer = url.referer
        url = url.value

        if referer:
            # NOTE(review): ``headers`` is not defined inside this method, so
            # this writes into a mapping owned by an outer scope and the
            # referer stays set for later calls -- confirm this is intended.
            headers['referer'] = referer

        # Proxy settings
        app_config = get_app_config()
        proxies = app_config.proxy_mapping.get_url_proxy(url)
        logger.info('Start request url: {}, proxy: {}'.format(url, proxies))

        for i in range(retry):
            from_t = datetime.datetime.now()
            logger.info('[{}] request url: {}, proxy: {}'.format(i, url, proxies))
            try:
                # Use the per-thread session or a plain request, as configured.
                if app_config.use_session:
                    session = get_current_thread_session()
                    session.proxies = proxies
                    # The session follows Tmall's 302 redirects automatically
                    # and there may be several hops.
                    # NOTE(review): the original comment said not to set a
                    # timeout, yet 120 s is passed here -- confirm.
                    res = session.get(url, timeout=120)
                else:
                    res = requests.get(url, headers=headers, proxies=proxies)

                # Elapsed time must be taken after the response has arrived.
                # total_seconds() is used because ``timedelta.microseconds``
                # only holds the sub-second part of the duration.
                dt = int((datetime.datetime.now() - from_t).total_seconds() * 1000)

                if res.ok:
                    logger.info('[{}] request url success, takes: {} ms, size:{}, {}'.format(i, dt, len(res.text), url))
                    if res.encoding == 'ISO-8859-1':
                        result = res.content.decode('utf8')
                    elif res.encoding is None and res.apparent_encoding == 'ISO-8859-1':
                        result = res.content.decode('gb2312')
                    else:
                        result = res.text if res.encoding in (
                            'gbk', 'GBK', None, 'gb2312', 'ISO-8859-1') else res.content.decode('utf8')

                    # Check against the configured conditions whether this is
                    # a normal response; if not, fetch again.
                    if app_config.fail_conditions.test(url, result):
                        ok = True
                        break
                    else:
                        # Blocked by anti-crawling: wait 4 s for the proxy
                        # address to switch.
                        time.sleep(4)
                else:
                    logger.info('[{}] request url failed, takes {}ms, code:{}-{}'.format(i, dt, res.status_code, res.reason))
                    time.sleep(0.5)
            except Exception as e:
                # Best effort: any request or decode error is logged and the
                # next attempt is made.
                logger.error('[{}] request url failed, error: {}'.format(i, e))
            time.sleep(0.1)
        return ok, result
示例#3
0
def create_app(env: str = None):
    """
    Application factory for the Flask app.

    :param env: environment name passed to ``get_app_config``; when None, the
        ``FLASK_ENV`` environment variable is used, defaulting to
        ``'development'``
    :return: Flask application
    """
    environment = os.getenv('FLASK_ENV', 'development') if env is None else env

    flask_app = Flask(__name__)
    flask_app.config.from_object(get_app_config(environment))
    flask_app.logger = structlog.get_logger()

    with flask_app.app_context():
        elastic_url = flask_app.config.get('ELASTICSEARCH_URL')
        if elastic_url:
            # Connect, then create the index; ``ignore=400`` is passed to the
            # create call.
            client = get_elasticsearch_client(elastic_url)
            client.indices.create(
                index=os.getenv('ELASTICSEARCH_INDEX_NAME'), ignore=400)
            # Keep the client on the application object for later use.
            flask_app.elasticsearch = client

        register_extensions(flask_app)
        register_blueprints(flask_app)
        register_errorhandlers(flask_app)
        register_jinja_env(flask_app)

    return flask_app
示例#4
0
def create_app(app_config=None):
    """
    Build the Flask application.

    Applies ``CORS`` to the app, loads the configuration, initialises the
    database and migration extensions, and registers the ``api`` blueprint.

    :param app_config: configuration object for ``app.config.from_object``;
        when falsy, the result of ``get_app_config()`` is used instead
    :return: the configured Flask application
    """
    flask_app = Flask(__name__)
    CORS(flask_app)

    # Fall back to the default configuration when none is supplied.
    flask_app.config.from_object(app_config or get_app_config())

    # Bind the database and migration extensions to this application.
    db.init_app(flask_app)
    migrate.init_app(flask_app, db)

    flask_app.register_blueprint(api)
    return flask_app
示例#5
0
def start_app(app_name: str, urls: list, rule_name: str, process_count=3):
    """
    Start a crawl application using the implementation for the configured mode.

    The mode is read from ``get_app_config().app_mode``. Each supported mode
    imports ``App`` from a different module; an unsupported mode is logged as
    an error and nothing is started.

    :param app_name: application name passed to ``App``
    :param urls: entry links passed to ``App``
    :param rule_name: parse rule name passed to ``App``
    :param process_count: value passed to ``App.schedule``
    """
    mode = get_app_config().app_mode
    logger.info('Start app [{}] with mode [{}]'.format(app_name, mode))

    # Only the import differs between modes; construction and scheduling
    # are the same for all of them.
    if mode == AppMode.MULTI_THREAD:
        from app_threaded import App
    elif mode == AppMode.THREAD_POOL:
        from app_pooled import App
    elif mode == AppMode.MULTI_PROCESS:
        from app_processed import App
    else:
        logger.error('Not support mode: %s', mode)
        return

    app = App(app_name, urls, rule_name)
    app.schedule(process_count)
示例#6
0
def create_app(config_name, github_api=github):
    """
    Build the Flask application with an index page and a ``/replicate``
    endpoint that forks the configured repository for the signed-in user.

    :param config_name: not used by the code in this function
    :param github_api: object used for GitHub calls (``authorized``, ``get``,
        ``post``); defaults to the module-level ``github``
    :return: the configured Flask application
    """
    app = Flask(__name__)

    settings = config.get_app_config(os.environ)
    app.secret_key = settings['app_secret']

    github_blueprint = make_github_blueprint(
        client_id=settings['github_client_id'],
        client_secret=settings['github_client_secret'],
        scope='public_repo',
        redirect_url='/replicate',
    )
    app.register_blueprint(github_blueprint, url_prefix='/replicate')

    @app.route('/', methods=['GET'])
    def index(**kwargs):
        """Render the landing page."""
        return render_template('index.html'), 200

    @app.route('/replicate', methods=['GET'])
    def replicate(**kwargs):
        """Fork the configured repository and render the outcome."""
        if not github_api.authorized:
            return redirect(url_for('github.authorized'))

        # Login name of the current user, taken from the '/user' response.
        target_user = github_api.get('/user').json()['login']

        response = github_api.post(
            f'/repos/{settings["repo_owner"]}/{settings["repo_name"]}/forks'
        )
        repo_link = config.get_url_to_repo(target_user, settings['repo_name'])

        # Status 202 renders the success page; anything else is a failure.
        if response.status_code != 202:
            return render_template('failure.html', response=response.json())
        return render_template('success.html', repo_link=repo_link)

    return app
示例#7
0
 def __init__(self):
     """Store the configured application name and create the Elasticsearch client for ``self.HOSTS``."""
     app_config = get_app_config()
     self.app_name = app_config.app_name
     self.client = Elasticsearch(hosts=self.HOSTS)
示例#8
0
import hashlib
import pathlib
import functools
import datetime

from common.annotation import time_it
from config import *
from config import get_app_config
from common.log import logger
from common.util import now, take_ms

if get_app_config().cache_mode == CacheMode.ELASTICSEARCH:
    from elasticsearch import Elasticsearch


class EsClient(object):
    HOSTS = ['192.168.0.1', '192.168.0.2', '192.168.0.3']
    INDEX = 'index-'
    TYPE = 'default'
    KEY = 'data'

    INSTANCE = None
    MAX_ID_LENGTH = 256

    def __init__(self):
        """Store the configured application name and create the Elasticsearch client for ``self.HOSTS``."""
        app_config = get_app_config()
        self.app_name = app_config.app_name
        self.client = Elasticsearch(hosts=self.HOSTS)

    @staticmethod
    def get_id(url):
        return url if len(url) < EsClient.MAX_ID_LENGTH else url[:EsClient.
示例#9
0
    (多线程)爬虫应用框架,用于创建一个定时抓取任务,只需要指定抓取入口链接和对应的解析规则即可,
    后续提取和链接会自动添加到抓取队列,例如:
    my_app = App('myApp', url='http://www.sina.com', rule_name='sina')
    my_app.schedule()
    """
    def __init__(self, name: str, url: (str, list), rule_name: str):
        """
        Set up a crawl application by delegating to the parent initialiser.

        :param name: application name
        :param url: entry link (a single value or a list)
        :param rule_name: name of the parse rule for the entry link content
        """
        super().__init__(name, url, rule_name)

    def start_job(self):
        """Create a ``Job`` from this app's name, URLs, rule name and thread count, then start it."""
        Job(
            self.name,
            self.urls,
            self.rule_name,
            self.normal_thread_count,
        ).start()

    def schedule(self, normal_thread_count=2):
        """
        Record the thread count and start the job.

        :param normal_thread_count: value stored on
            ``self.normal_thread_count`` before ``start_job`` is called
        """
        # The count must be stored first: start_job reads it from the instance.
        self.normal_thread_count = normal_thread_count
        self.start_job()


if __name__ == '__main__':
    # Install the proxy mapping returned by ProxyMapping.of_asdl_high()
    # before the application is created.
    get_app_config().proxy_mapping = ProxyMapping.of_asdl_high()

    entry_url = 'https://list.jd.hk/list.html?cat=1316,1381,1389&page=1'
    app = App('sync_app', url=entry_url, rule_name='jd_page')
    app.schedule()
示例#10
0
文件: run.py 项目: sh3raawii/foodie
from config import get_app_config
from foodie_app import create_app

if __name__ == '__main__':
    # Build the app from the default configuration and serve it locally.
    settings = get_app_config()
    app = create_app(settings)
    app.run(host='127.0.0.1', port=5000)
示例#11
0
 def is_proxy_url(cls, url):
     """Return True when the configured proxy mapping yields a proxy (not None) for ``url``."""
     proxy = get_app_config().proxy_mapping.get_url_proxy(url)
     return proxy is not None
示例#12
0
 def put(self, item, block=True, timeout=None):
     """
     Forward the item to the parent ``put`` unless saving is disabled.

     :param item: item passed to the parent ``put``
     :param block: passed through to the parent ``put``
     :param timeout: passed through to the parent ``put``
     :return: the parent's return value, or None when the ``no_save``
         setting of the application config is truthy
     """
     if get_app_config().no_save:
         return None
     return super().put(item, block, timeout)
示例#13
0
import os

import config

from app import create_app

if __name__ == '__main__':
    # Read the settings from the process environment, then build and serve
    # the application on the configured host and port.
    settings = config.get_app_config(os.environ)
    app = create_app(settings['app_config_name'])
    app.run(host=settings['host'], port=settings['port'])