示例#1
0
    def load_domains(cls,
                     session,
                     fn,
                     site_type,
                     ignore_inactive=False,
                     force_inactive=False,
                     ignore_redirected=False,
                     exclusive=False):
        """Load domains from a text file and insert or update them as sites.

        Every non-comment line of `fn` is passed to `parse_domain`, whose
        result is unpacked as a site mapping plus a status string. Lines
        with status 'invalid', 'inactive' or 'redirected' are logged, and
        the load is aborted unless the corresponding flag tolerates them.
        Nothing is written to the database before validation has passed.

        Parameters
        ----------
        session : object
            Database session handed to the query and insert helpers.
        fn : str
            Path of the text file listing one domain per line.
        site_type : str
            Site type forwarded to `parse_domain`; also selects which
            existing sites are disabled when `exclusive` is set.
        ignore_inactive : bool
            Tolerate inactive domains and skip them.
        force_inactive : bool
            Tolerate inactive domains and load them anyway.
        ignore_redirected : bool
            Tolerate redirected domains and skip them.
        exclusive : bool
            Disable existing sites of the same `site_type` before loading.

        Raises
        ------
        SystemExit
            With code 2 when the file contains an invalid domain, or an
            inactive/redirected domain that no flag allows.
        """
        logger.info('Sending HTTP requests to infer base URLs ...')
        with open(fn, 'r') as f:
            # n + 1: report 1-based line numbers in the messages below.
            site_tuples = [(n + 1, line) + parse_domain(line, site_type)
                           for n, line in enumerate(f)
                           if not is_comment_line(line)]
        invalid_flag = False
        inactive_flag = False
        redirected_flag = False
        for n, line, site, status in site_tuples:
            line = line.strip('\n')
            if status == 'invalid':
                invalid_flag = True
                logger.error('line %i %r, invalid domain', n, line)
            elif status == 'inactive':
                inactive_flag = True
                logger.warning('line %i %r, domain inactive!', n, line)
            elif status == 'redirected':
                redirected_flag = True
                logger.warning('line %i %r, domain redirected to %s!', n, line,
                               site['base_url'])
        if invalid_flag is True or \
                (inactive_flag is True and (ignore_inactive is False and
                                            force_inactive is False)) or \
                (redirected_flag is True and ignore_redirected is False):
            logger.error(
                'Please fix the warnings or errors above! '
                'Edit domains, or use --ignore-redirected to handle '
                'redirected domains, or use --ignore-inactive or '
                '--force-inactive to handle inactive domains')
            raise SystemExit(2)
        if exclusive:
            # Disable existing sites of the same type only once the input is
            # known to be acceptable, so that a rejected file does not leave
            # the current sites disabled with nothing loaded in their place.
            msites = get_msites(session, fb_kw=None, ob_expr=Site.id.asc())
            for msite in msites:
                if msite.site_type == site_type:
                    cls.disable_site(session, msite)
        for _, _, site, status in site_tuples:
            if status == 'inactive' and ignore_inactive is True:
                continue
            if status == 'redirected' and ignore_redirected is True:
                continue
            # Inactive domains reach this point only under force_inactive.
            site['is_enabled'] = True
            get_or_create_m(
                session, Site, site, fb_uk='domain', onduplicate='update')
            logger.debug('Insert or update site %s', site['domain'])
示例#2
0
    def dump(cls, session, yaml_fn):
        """Write every site stored in the database to a YAML file.

        Sites are fetched in ascending id order, converted into ordered
        mappings (so the key order in the output is fixed) and serialized
        with ruamel's round-trip dumper, preceded by a short comment header.

        Parameters
        ----------
        session : object
            Database session used to query the sites.
        yaml_fn : str
            Path of the YAML file to write; it is opened in 'w' mode.
        """

        def spider_entry(rule):
            # Keep only the two spider fields, in a fixed key order.
            entry = CommentedMap()
            entry['spider_name'] = rule['spider_name']
            entry['spider_kwargs'] = rule['spider_kwargs']
            return entry

        records = []
        for ms in get_msites(session, fb_kw=None, ob_expr=Site.id.asc()):
            record = CommentedMap()
            record['name'] = ms.name
            record['domain'] = ms.domain
            record['site_type'] = ms.site_type
            record['base_url'] = ms.base_url
            record['site_tags'] = [{
                'name': tag.name,
                'source': tag.source
            } for tag in ms.site_tags]
            record['alternate_domains'] = [{
                'name': alt.name,
                'is_alive': alt.is_alive
            } for alt in ms.alternate_domains]
            record['is_alive'] = ms.is_alive
            record['is_enabled'] = ms.is_enabled
            rules = CommentedMap()
            record['article_rules'] = rules
            rules['url_regex'] = ms.article_rules['url_regex']
            rules['update'] = [
                spider_entry(rule) for rule in ms.article_rules['update']
            ]
            rules['archive'] = [
                spider_entry(rule) for rule in ms.article_rules['archive']
            ]
            records.append(record)

        header = (
            '# This file is generate by hoaxy site --dump command.\n'
            '# To understand the sites data structure, please read '
            'sites.readme.md, which\n'
            '# should locate under hoaxy/data/manuals/.\n'
            '\n')
        body = ruamel.yaml.round_trip_dump(records)
        with open(yaml_fn, 'w') as out:
            out.write(header + body)
        logger.info('Sites dumped into YAML file %s', yaml_fn)
示例#3
0
    def run(cls, args):
        """Entry point of this command: validate `args` and dispatch.

        Exactly one of the actions below is carried out, chosen by the
        first flag that is True:

        * '--fetch-url': query sites and call `cls.fetch_url` with purpose
          'update' (when '--update' is True) or 'archive'.
        * '--fetch-html': query (id, raw) URL tuples and call
          `cls.fetch_html`.
        * '--parse-article': query URL tuples and call `cls.parse_article`.

        '--where-expr' replaces the per-action default filter, '--order-by'
        selects ascending ('asc') or descending id order, and '--limit'
        caps the number of rows.

        Parameters
        ----------
        args : dict
            Parsed command line arguments, validated against
            `cls.args_schema`.

        Raises
        ------
        SystemExit
            If `args` fails schema validation, if the site table is empty
            ('--fetch-html'), or with code 2 if the query matches nothing.
        """
        try:
            args = cls.args_schema.validate(args)
        except SchemaError as e:
            raise SystemExit(e)

        session = Session(expire_on_commit=False)
        # try/finally: the session must also be released on the SystemExit
        # paths below and when a crawling worker raises.
        try:
            where_expr = args['--where-expr']
            ob_expr = args.get('--order-by', 'asc')
            limit = args['--limit']
            # --fetch-url
            if args['--fetch-url'] is True:
                configure_logging('crawl.fetch-url',
                                  console_level='DEBUG',
                                  file_level='WARNING')
                purpose = 'update' if args['--update'] is True else 'archive'
                if where_expr is None:
                    where_expr = [text(DEFAULT_WHERE_EXPR_FETCH_URL)]
                else:
                    where_expr = [text(where_expr)]
                ob_expr = Site.id.asc() if ob_expr == 'asc' else Site.id.desc()
                msites = get_msites(session,
                                    f_expr=where_expr,
                                    ob_expr=ob_expr,
                                    limit=limit)
                if len(msites) == 0:
                    logger.warning("None sites you queried found in DB!")
                    raise SystemExit(2)
                platform_id = get_platform_id(session, name=N_PLATFORM_WEB)
                # Detach the sites from the session, since they are not
                # meant to be modified through it.
                for ms in msites:
                    session.expunge(ms)
                logger.warning(
                    'Starting crawling process to fetch URL update ...')
                cls.fetch_url(session, msites, platform_id, purpose)
            # --fetch-html
            elif args['--fetch-html'] is True:
                configure_logging('crawl.fetch-html',
                                  console_level='DEBUG',
                                  file_level='WARNING')
                if not session.query(Site.id).count() > 0:
                    raise SystemExit('Your site table is empty!')
                q = session.query(Url.id, Url.raw)
                if where_expr is None:
                    where_expr = [text(DEFAULT_WHERE_EXPR_FETCH_HTML)]
                else:
                    where_expr = [text(where_expr)]
                ob_expr = Url.id.asc() if ob_expr == 'asc' else Url.id.desc()
                q = q.filter(*where_expr).order_by(ob_expr)
                if limit is not None:
                    q = q.limit(limit)
                # Log the final SQL with the bound values inlined.
                logger.info(
                    q.statement.compile(
                        compile_kwargs={"literal_binds": True}))
                url_tuples = q.all()
                if not url_tuples:
                    logger.warning('No such URLs in DB!')
                    raise SystemExit(2)
                logger.warning('Starting crawling process to fetch HTML ...')
                cls.fetch_html(session, url_tuples)
            # --parse-article
            elif args['--parse-article'] is True:
                configure_logging('crawl.parse-article',
                                  console_level='DEBUG',
                                  file_level='WARNING')
                q = session.query(Url.id, Url.created_at, Url.date_published,
                                  Url.canonical, Url.site_id)
                if where_expr is None:
                    where_expr = [text(DEFAULT_WHERE_EXPR_PARSE_ARTICLE)]
                else:
                    where_expr = [text(where_expr)]
                ob_expr = Url.id.asc() if ob_expr == 'asc' else Url.id.desc()
                q = q.filter(*where_expr).order_by(ob_expr)
                if limit is not None:
                    q = q.limit(limit)
                # Log the final SQL with the bound values inlined.
                logger.info(
                    q.statement.compile(
                        compile_kwargs={"literal_binds": True}))
                url_tuples = q.all()
                if not url_tuples:
                    logger.warning('No URLs found from DB!')
                    raise SystemExit(2)
                logger.warning(
                    'Starting crawling process to parse article ...')
                cls.parse_article(session, url_tuples)
        finally:
            session.close()
示例#4
0
    def run(cls, args):
        """Entry point of this command: dispatch on the selected action.

        The first flag in `args` that is True decides what happens:
        '--load-domains', '--load-sites', '--add', '--add-site-tags',
        '--replace-site-tags', '--add-alternate-domains',
        '--replace-alternate-domains', '--disable', '--enable',
        '--bulk-enable', '--bulk-disable', '--status' or '--dump'.
        Logging is configured per action, and the database session is
        closed when the action finishes, whether or not it raised.

        Parameters
        ----------
        args : dict
            Parsed command line arguments. `args['<file>']` may be None, in
            which case the load actions fall back to a default file under
            HOAXY_HOME.
        """
        session = Session(expire_on_commit=False)

        def setup_logging(name, file_level='WARNING'):
            # Every action logs to the console at the user-chosen level.
            configure_logging(
                name,
                console_level=args['--console-log-level'],
                file_level=file_level)

        def find_site():
            # Look up the site selected by --name/--domain; warn and return
            # None when there is no such site.
            msite = qquery_msite(
                session, name=args['--name'], domain=args['--domain'])
            if msite is None:
                if args['--name'] is not None:
                    site_identity = args['--name']
                else:
                    site_identity = args['--domain']
                logger.warning('Site %s does not exist!', site_identity)
            return msite

        def apply_bulk(action):
            # Apply `action(session, msite)` to each site listed in --names,
            # or in --domains when --names is None; warn about unknown ones.
            if args['--names'] is not None:
                for name in args['--names']:
                    msite = qquery_msite(session, name=name, domain=None)
                    if msite is None:
                        logger.warning('Site %s does not exist!', name)
                    else:
                        action(session, msite)
            else:
                for domain in args['--domains']:
                    msite = qquery_msite(session, name=None, domain=domain)
                    if msite is None:
                        logger.warning('Site %s does not exist!', domain)
                    else:
                        action(session, msite)

        # try/finally: release the session even when an action raises
        # (load_domains, for instance, raises SystemExit on bad input).
        try:
            # expand user home for the file
            if args['<file>'] is not None:
                args['<file>'] = os.path.expanduser(args['<file>'])
            # --load-domains commands
            if args['--load-domains'] is True:
                setup_logging('site.load-domains')
                # The '<file>' key always exists (it is indexed above), so
                # dict.get() with a default would never apply the default;
                # test for None explicitly.
                fn = args['<file>']
                if fn is None:
                    fn = join(HOAXY_HOME, 'domains.txt')
                logger.info('Loading data from file %r', fn)
                cls.load_domains(
                    session,
                    fn,
                    site_type=args['--site-type'],
                    ignore_inactive=args['--ignore-inactive'],
                    force_inactive=args['--force-inactive'],
                    ignore_redirected=args['--ignore-redirected'],
                    exclusive=args['--exclusive'])
            # --load-sites commands
            elif args['--load-sites'] is True:
                setup_logging('site.load-sites')
                fn = args['<file>']
                if fn is None:
                    fn = join(HOAXY_HOME, 'sites.yaml')
                logger.info('Loading data from file %r', fn)
                cls.load_sites(
                    session,
                    fn,
                    ignore_inactive=args['--ignore-inactive'],
                    force_inactive=args['--force-inactive'],
                    ignore_redirected=args['--ignore-redirected'])
            # --add commands
            elif args['--add'] is True:
                setup_logging('site.add')
                msite = qquery_msite(session, domain=args['--domain'])
                if msite is not None:
                    logger.warning('Site %s already exists!', args['--domain'])
                else:
                    cls.add_site(
                        session,
                        domain=args['--domain'],
                        site_type=args['--site-type'],
                        name=args['--name'],
                        tag_source=args['--tag-source'],
                        site_tags=args['--site-tag'],
                        alternate_domains=args['--alternate-domain'],
                        ignore_inactive=args['--ignore-inactive'],
                        force_inactive=args['--force-inactive'],
                        ignore_redirected=args['--ignore-redirected'])
            # --add-site-tags
            elif args['--add-site-tags'] is True:
                setup_logging('site.add-site-tags')
                msite = find_site()
                if msite is not None:
                    cls.add_site_tags(session, msite, args['--tag-source'],
                                      args['--site-tag'])
            # --replace-site-tags
            elif args['--replace-site-tags'] is True:
                setup_logging('site.replace-site-tags')
                msite = find_site()
                if msite is not None:
                    cls.replace_site_tags(session, msite,
                                          args['--tag-source'],
                                          args['--site-tag'])
            # --add-alternate-domains
            elif args['--add-alternate-domains'] is True:
                setup_logging('site.add-alternate-domains')
                msite = find_site()
                if msite is not None:
                    cls.add_alternate_domains(session, msite,
                                              args['--alternate-domain'])
            # --replace-alternate-domains
            elif args['--replace-alternate-domains'] is True:
                setup_logging('site.replace-alternate-domains')
                msite = find_site()
                if msite is not None:
                    cls.replace_alternate_domains(session, msite,
                                                  args['--alternate-domain'])
            # --disable
            elif args['--disable'] is True:
                setup_logging('site.disable')
                msite = find_site()
                if msite is not None:
                    cls.disable_site(session, msite)
            # --enable
            elif args['--enable'] is True:
                setup_logging('site.enable')
                msite = find_site()
                if msite is not None:
                    cls.enable_site(session, msite)
            # bulk enable sites and domains
            elif args['--bulk-enable'] is True:
                setup_logging('site.bulk-enable')
                if args['--exclusive'] is True:
                    # disable all existing sites first
                    msites = get_msites(
                        session, fb_kw=None, ob_expr=Site.id.asc())
                    for existing_site in msites:
                        cls.disable_site(session, existing_site)
                apply_bulk(cls.enable_site)
            # bulk disable sites and domains
            elif args['--bulk-disable'] is True:
                setup_logging('site.bulk-disable')
                apply_bulk(cls.disable_site)
            # --status
            elif args['--status'] is True:
                setup_logging('site.status')
                cls.site_status(session, args['--include-disabled'] is True)
            # --dump
            elif args['--dump'] is True:
                # NOTE(review): this reuses the 'site.status' logging name;
                # possibly a copy-paste of the branch above - confirm
                # whether 'site.dump' was intended.
                setup_logging('site.status', file_level='INFO')
                cls.dump(session, args['<file>'])
        finally:
            session.close()