def run(cls, args):
    """Overriding method as the entry point of this command."""
    try:
        args = cls.args_schema.validate(args)
    except SchemaError as e:
        raise SystemExit(e)
    session = Session(expire_on_commit=False)
    where_expr = args['--where-expr']
    ob_expr = args.get('--order-by', 'asc')
    limit = args['--limit']
    # --fetch-url
    if args['--fetch-url'] is True:
        configure_logging('crawl.fetch-url',
                          console_level='DEBUG',
                          file_level='WARNING')
        purpose = 'update' if args['--update'] is True else 'archive'
        if where_expr is None:
            where_expr = [text(DEFAULT_WHERE_EXPR_FETCH_URL)]
        else:
            where_expr = [text(where_expr)]
        ob_expr = Site.id.asc() if ob_expr == 'asc' else Site.id.desc()
        msites = get_msites(session,
                            f_expr=where_expr,
                            ob_expr=ob_expr,
                            limit=limit)
        if len(msites) == 0:
            logger.warning('No sites matching your query were found in the DB!')
            raise SystemExit(2)
        platform_id = get_platform_id(session, name=N_PLATFORM_WEB)
        # detach msites and mplatform from session,
        # since they definitely would not be modified in session
        for ms in msites:
            session.expunge(ms)
        logger.warning('Starting crawling process to fetch URL update ...')
        cls.fetch_url(session, msites, platform_id, purpose)
    # --fetch-html
    elif args['--fetch-html'] is True:
        configure_logging('crawl.fetch-html',
                          console_level='DEBUG',
                          file_level='WARNING')
        if not session.query(Site.id).count() > 0:
            raise SystemExit('Your site table is empty!')
        q = session.query(Url.id, Url.raw)
        if where_expr is None:
            where_expr = [text(DEFAULT_WHERE_EXPR_FETCH_HTML)]
        else:
            where_expr = [text(where_expr)]
        ob_expr = Url.id.asc() if ob_expr == 'asc' else Url.id.desc()
        q = q.filter(*where_expr).order_by(ob_expr)
        if limit is not None:
            q = q.limit(limit)
        logger.info(
            q.statement.compile(compile_kwargs={"literal_binds": True}))
        url_tuples = q.all()
        if not url_tuples:
            logger.warning('No such URLs in DB!')
            raise SystemExit(2)
        logger.warning('Starting crawling process to fetch HTML ...')
        cls.fetch_html(session, url_tuples)
    # --parse-article
    elif args['--parse-article'] is True:
        configure_logging('crawl.parse-article',
                          console_level='DEBUG',
                          file_level='WARNING')
        q = session.query(Url.id, Url.created_at, Url.date_published,
                          Url.canonical, Url.site_id)
        if where_expr is None:
            where_expr = [text(DEFAULT_WHERE_EXPR_PARSE_ARTICLE)]
        else:
            where_expr = [text(where_expr)]
        ob_expr = Url.id.asc() if ob_expr == 'asc' else Url.id.desc()
        q = q.filter(*where_expr).order_by(ob_expr)
        if limit is not None:
            q = q.limit(limit)
        logger.info(
            q.statement.compile(compile_kwargs={"literal_binds": True}))
        url_tuples = q.all()
        if not url_tuples:
            logger.warning('No URLs found from DB!')
            raise SystemExit(2)
        logger.warning('Starting crawling process to parse article ...')
        cls.parse_article(session, url_tuples)
    session.close()
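# A self-contained sketch of the query-building pattern used above: a raw
# --where-expr string forwarded through text(), the order direction chosen
# from an 'asc'/'desc' string, and an optional limit. Everything here is
# illustrative only; the in-memory SQLite `Url` class merely stands in for
# the real Url model.
from sqlalchemy import Column, Integer, String, create_engine, text
from sqlalchemy.orm import declarative_base, sessionmaker

Base = declarative_base()

class Url(Base):  # minimal stand-in for the real Url model
    __tablename__ = 'url'
    id = Column(Integer, primary_key=True)
    raw = Column(String)

engine = create_engine('sqlite://')
Base.metadata.create_all(engine)
session = sessionmaker(bind=engine)()
session.add_all([Url(raw='http://a.test/1'), Url(raw='http://b.test/2')])
session.commit()

where_expr = [text("raw LIKE 'http://a%'")]   # as --where-expr would arrive
ob_expr = Url.id.asc()                        # the 'asc' branch of --order-by
q = session.query(Url.id, Url.raw).filter(*where_expr).order_by(ob_expr)
q = q.limit(10)                               # applied only when --limit is given
print(q.all())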
def run(cls, args):
    """Overriding method as the entry point of this command."""
    session = Session()
    # --volume
    if args['--volume'] is True:
        configure_logging('report.volume',
                          console_level=args['--console-log-level'],
                          file_level='WARNING')
        table_names = ['tweet', 'url', 'article']
        table = args['--table']
        if table not in table_names:
            logger.critical('Available tables are: %s', table_names)
            sys.exit(2)
        interval_names = [
            'minute', 'hour', 'day', 'week', 'month', 'quarter', 'year'
        ]
        interval = args['--interval']
        if interval not in interval_names:
            logger.critical('Available intervals are: %s', interval_names)
            sys.exit(2)
        limit = args['--limit']
        if int(limit) <= 0:
            logger.critical('%r should be larger than 0', limit)
            sys.exit(2)
        # the table name is whitelisted above, so string interpolation is
        # safe here; interval and limit travel as bind parameters
        sql = """
        SELECT count(id) AS agg_num,
               date_trunc(:interval, created_at) AS interval
        FROM %s
        GROUP BY interval
        ORDER BY interval DESC
        LIMIT :limit""" % table
        stmt = text(sql).bindparams(interval=interval, limit=limit)
        strf = '%Y-%m-%d %H:%M:%S'
        with ENGINE.connect() as conn:
            result = conn.execute(stmt).fetchall()
        print('-' * 35)
        print('{0:^20s} | {1:12s}'.format('Timeline (%s)' % interval,
                                          'Aggregation'))
        print('-' * 35)
        for v, t in result:
            print('{0:^20s} | {1:8d}'.format(t.strftime(strf), v))
        print('-' * 35)
    # --status
    elif args['--status']:
        configure_logging('report.streaming-status',
                          console_level=args['--console-log-level'])
        table_name = None
        if args['--status'] == 'twitter':
            table_name = 'tweet'
        if table_name is None:
            logger.critical('SNS %r has not been implemented!',
                            args['--status'])
            sys.exit(2)
        sql = 'SELECT created_at FROM {} ORDER BY id DESC LIMIT 1'.format(
            table_name)
        with ENGINE.connect() as conn:
            most_recent, = conn.execute(text(sql)).fetchone()
        delta_minutes = 30
        delta = timedelta(minutes=delta_minutes)
        current_utc = datetime.utcnow()
        if current_utc - most_recent > delta:
            logger.critical('No %s streaming update in the past %s minutes!',
                            args['--status'], delta_minutes)
        else:
            logger.info('Most recent %s streaming update is %s',
                        args['--status'], str(most_recent) + ' (UTC)')
    # --top-spreader
    elif args['--top-spreader'] is True:
        configure_logging('report.top-spreaders',
                          console_level=args['--console-log-level'],
                          file_level='WARNING')
        # create the table if it does not exist yet
        if not Top20SpreaderMonthly.__table__.exists(bind=ENGINE):
            Top20SpreaderMonthly.__table__.create(bind=ENGINE)
        if args['--force-today'] is True:
            upper_day = datetime.utcnow().date()
        elif args['--upper-day'] is None:
            upper_day = datetime.utcnow().date() - timedelta(days=1)
        else:
            try:
                upper_day = parse(args['--upper-day']).date()
            except Exception:
                raise ValueError('Invalid date: %s' % args['--upper-day'])
        if args['--generate'] is True:
            logger.warning('Generating top spreaders for upper_day=%r ...',
                           upper_day)
            cls.generate_top_spreaders(session, upper_day)
        elif args['--look-up'] is True:
            cls.look_up_top_spreaders(session, upper_day,
                                      args['--most-recent'])
    # --top-article
    elif args['--top-article'] is True:
        configure_logging('report.top-article',
                          console_level=args['--console-log-level'],
                          file_level='WARNING')
        # create the table if it does not exist yet
        if not Top20ArticleMonthly.__table__.exists(bind=ENGINE):
            Top20ArticleMonthly.__table__.create(bind=ENGINE)
        if args['--force-today'] is True:
            upper_day = datetime.utcnow().date()
        elif args['--upper-day'] is None:
            upper_day = datetime.utcnow().date() - timedelta(days=1)
        else:
            try:
                upper_day = parse(args['--upper-day']).date()
            except Exception:
                raise ValueError('Invalid date: %s' % args['--upper-day'])
        if args['--generate'] is True:
            logger.warning('Generating top articles for upper_day=%r ...',
                           upper_day)
            cls.generate_top_articles(session, upper_day)
        elif args['--look-up'] is True:
            cls.look_up_top_articles(session, upper_day,
                                     args['--most-recent'])
    session.close()
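# A standalone sketch of the --volume aggregation above, runnable outside
# the command class (assumptions: a reachable PostgreSQL database with a
# `tweet` table carrying id/created_at columns; the DSN is a placeholder).
# Note the split: `interval` and `limit` travel as bind parameters, while
# the table name is interpolated into the string, because SQL cannot bind
# identifiers; that is why the code above whitelists `table` first.
from sqlalchemy import create_engine, text

engine = create_engine('postgresql://localhost/hoaxy')  # hypothetical DSN
stmt = text("""
    SELECT count(id) AS agg_num,
           date_trunc(:interval, created_at) AS interval
    FROM tweet
    GROUP BY interval
    ORDER BY interval DESC
    LIMIT :limit""").bindparams(interval='day', limit=20)
with engine.connect() as conn:
    for agg_num, interval in conn.execute(stmt):
        print('{0:%Y-%m-%d %H:%M:%S} | {1:8d}'.format(interval, agg_num))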
def run(cls, args):
    """Overriding method as the entry point of this command."""
    session = Session(expire_on_commit=False)
    # expand user home for the file
    if args['<file>'] is not None:
        args['<file>'] = os.path.expanduser(args['<file>'])
    # --load-domains command
    if args['--load-domains'] is True:
        configure_logging('site.load-domains',
                          console_level=args['--console-log-level'],
                          file_level='WARNING')
        # docopt always includes the '<file>' key (None when omitted), so
        # args.get() with a default would never fall back; test explicitly
        fn = args['<file>'] if args['<file>'] is not None \
            else join(HOAXY_HOME, 'domains.txt')
        logger.info('Loading data from file %r', fn)
        cls.load_domains(session,
                         fn,
                         site_type=args['--site-type'],
                         ignore_inactive=args['--ignore-inactive'],
                         force_inactive=args['--force-inactive'],
                         ignore_redirected=args['--ignore-redirected'])
    # --load-sites command
    elif args['--load-sites'] is True:
        configure_logging('site.load-sites',
                          console_level=args['--console-log-level'],
                          file_level='WARNING')
        fn = args['<file>'] if args['<file>'] is not None \
            else join(HOAXY_HOME, 'sites.yaml')
        logger.info('Loading data from file %r', fn)
        cls.load_sites(session,
                       fn,
                       ignore_inactive=args['--ignore-inactive'],
                       force_inactive=args['--force-inactive'],
                       ignore_redirected=args['--ignore-redirected'])
    # --add command
    elif args['--add'] is True:
        configure_logging('site.add',
                          console_level=args['--console-log-level'],
                          file_level='WARNING')
        msite = qquery_msite(session, domain=args['--domain'])
        if msite is not None:
            logger.warning('Site %s already exists!', args['--domain'])
        else:
            cls.add_site(session,
                         domain=args['--domain'],
                         site_type=args['--site-type'],
                         name=args['--name'],
                         tag_source=args['--tag-source'],
                         site_tags=args['--site-tag'],
                         alternate_domains=args['--alternate-domain'],
                         ignore_inactive=args['--ignore-inactive'],
                         force_inactive=args['--force-inactive'],
                         ignore_redirected=args['--ignore-redirected'])
    # --add-site-tags
    elif args['--add-site-tags'] is True:
        configure_logging('site.add-site-tags',
                          console_level=args['--console-log-level'],
                          file_level='WARNING')
        if args['--name'] is not None:
            site_identity = args['--name']
        else:
            site_identity = args['--domain']
        msite = qquery_msite(session,
                             name=args['--name'],
                             domain=args['--domain'])
        if msite is None:
            logger.warning('Site %s does not exist!', site_identity)
        else:
            cls.add_site_tags(session, msite, args['--tag-source'],
                              args['--site-tag'])
    # --replace-site-tags
    elif args['--replace-site-tags'] is True:
        configure_logging('site.replace-site-tags',
                          console_level=args['--console-log-level'],
                          file_level='WARNING')
        if args['--name'] is not None:
            site_identity = args['--name']
        else:
            site_identity = args['--domain']
        msite = qquery_msite(session,
                             name=args['--name'],
                             domain=args['--domain'])
        if msite is None:
            logger.warning('Site %s does not exist!', site_identity)
        else:
            cls.replace_site_tags(session, msite, args['--tag-source'],
                                  args['--site-tag'])
    # --add-alternate-domains
    elif args['--add-alternate-domains'] is True:
        configure_logging('site.add-alternate-domains',
                          console_level=args['--console-log-level'],
                          file_level='WARNING')
        if args['--name'] is not None:
            site_identity = args['--name']
        else:
            site_identity = args['--domain']
        msite = qquery_msite(session,
                             name=args['--name'],
                             domain=args['--domain'])
        if msite is None:
            logger.warning('Site %s does not exist!', site_identity)
        else:
            cls.add_alternate_domains(session, msite,
                                      args['--alternate-domain'])
    # --replace-alternate-domains
    elif args['--replace-alternate-domains'] is True:
        configure_logging('site.replace-alternate-domains',
                          console_level=args['--console-log-level'],
                          file_level='WARNING')
        if args['--name'] is not None:
            site_identity = args['--name']
        else:
            site_identity = args['--domain']
        msite = qquery_msite(session,
                             name=args['--name'],
                             domain=args['--domain'])
        if msite is None:
            logger.warning('Site %s does not exist!', site_identity)
        else:
            cls.replace_alternate_domains(session, msite,
                                          args['--alternate-domain'])
    # --disable
    elif args['--disable'] is True:
        configure_logging('site.disable',
                          console_level=args['--console-log-level'],
                          file_level='WARNING')
        if args['--name'] is not None:
            site_identity = args['--name']
        else:
            site_identity = args['--domain']
        msite = qquery_msite(session,
                             name=args['--name'],
                             domain=args['--domain'])
        if msite is None:
            logger.warning('Site %s does not exist!', site_identity)
        else:
            cls.disable_site(session, msite)
    # --enable
    elif args['--enable'] is True:
        configure_logging('site.enable',
                          console_level=args['--console-log-level'],
                          file_level='WARNING')
        if args['--name'] is not None:
            site_identity = args['--name']
        else:
            site_identity = args['--domain']
        msite = qquery_msite(session,
                             name=args['--name'],
                             domain=args['--domain'])
        if msite is None:
            logger.warning('Site %s does not exist!', site_identity)
        else:
            cls.enable_site(session, msite)
    # --status
    elif args['--status'] is True:
        configure_logging('site.status',
                          console_level=args['--console-log-level'],
                          file_level='WARNING')
        if args['--include-disabled'] is True:
            cls.site_status(session, True)
        else:
            cls.site_status(session, False)
    # --dump
    elif args['--dump'] is True:
        configure_logging('site.dump',
                          console_level=args['--console-log-level'],
                          file_level='INFO')
        cls.dump(session, args['<file>'])
    session.close()
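# The name-or-domain resolution above repeats in six branches; a small
# helper like this (hypothetical, not part of the original module) captures
# the pattern once and could replace each repeated block:
def resolve_site(session, args):
    """Return (site_identity, msite), preferring --name over --domain."""
    site_identity = args['--name'] if args['--name'] is not None \
        else args['--domain']
    msite = qquery_msite(session,
                         name=args['--name'],
                         domain=args['--domain'])
    return site_identity, msite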