def __init__(self, session, url_tuples, *args, **kwargs):
    """Constructor of ArticleParserSpider.

    Parameters
    ----------
    session : obj
        A SQLAlchemy session instance.
    url_tuples : list
        A list of tuples (id, created_at, date_published, canonical_url,
        site_id), a URL collection fetched from the database.
    node_path : string
        Keyword argument: the node executable path.
    mercury_parser_path : string
        Keyword argument: pwd of
        <hoaxy-backened>/hoaxy/node_scripts/parse_with_mercury.js.
    """
    # Consume our private keyword arguments before delegating the rest
    # to the parent spider constructor.
    self.node_path = kwargs.pop('node_path')
    self.mercury_parser_installation_path = kwargs.pop('mercury_parser_path')
    self.session = session
    self.url_tuples = url_tuples
    configure_logging('crawl.parse-article',
                      console_level='CRITICAL',
                      file_level='WARNING')
    super(ArticleParserSpider, self).__init__(*args, **kwargs)
def run(cls, args):
    """Entry point of this command (overriding method)."""
    db_session = Session(expire_on_commit=False)
    # Only the streaming sub-command is handled here; anything else is
    # silently a no-op, exactly as before.
    if args['--twitter-streaming'] is not True:
        return
    configure_logging('twitter.streaming',
                      console_level=args['--console-log-level'])
    cls.twitter_stream(db_session, args)
def init(cls, session, force_drop, ignore_inactive=False,
         force_inactive=False, ignore_redirected=False):
    """Initialize the application: create tables, platforms and load sites.

    Parameters
    ----------
    session : obj
        A SQLAlchemy session instance.
    force_drop : bool
        When True, drop existing tables before recreating them.
    ignore_inactive : bool
        Forwarded to the site/domain loading helpers.
    force_inactive : bool
        Forwarded to the site/domain loading helpers.
    ignore_redirected : bool
        Forwarded to the site/domain loading helpers.
    """
    configure_logging('init', console_level='INFO', file_level='WARNING')
    # Timestamp used at the end to report which sites were added/updated.
    dt_before = datetime.utcnow()
    # Use the module-level `logger` consistently instead of the root
    # logger (`logging.*`), so all messages obey configure_logging().
    logger.info('Creating database tables:')
    if force_drop:
        logger.warning('Existed tables would be dropped and recreated!')
        Base.metadata.drop_all(ENGINE)
    else:
        logger.warning('Ignore existed tables')
    Base.metadata.create_all(ENGINE)
    logger.info('Inserting platforms if not exist')
    get_or_create_m(session, Platform, TWITTER_PLATFORM_DICT, fb_uk='name')
    get_or_create_m(session, Platform, WEB_PLATFORM_DICT, fb_uk='name')
    logger.info('Trying to load site data:')
    dc_file = join(HOAXY_HOME, 'domains_claim.txt')
    df_file = join(HOAXY_HOME, 'domains_factchecking.txt')
    site_file = join(HOAXY_HOME, 'sites.yaml')
    if isfile(dc_file):
        logger.info('Claim domains %s found', dc_file)
        SiteCmd.load_domains(session, dc_file, site_type='claim',
                             ignore_inactive=ignore_inactive,
                             force_inactive=force_inactive,
                             ignore_redirected=ignore_redirected)
    else:
        logger.info('Claim domains %s not found', dc_file)
    if isfile(df_file):
        logger.info('Fact checking domains %s found', df_file)
        SiteCmd.load_domains(session, df_file, site_type='fact_checking',
                             ignore_inactive=ignore_inactive,
                             force_inactive=force_inactive,
                             ignore_redirected=ignore_redirected)
    else:
        logger.info('Fact checking domains %s not found', df_file)
    if isfile(site_file):
        logger.info('Site file %s found', site_file)
        SiteCmd.load_sites(session, site_file,
                           ignore_inactive=ignore_inactive,
                           force_inactive=force_inactive,
                           ignore_redirected=ignore_redirected)
    else:
        logger.info('Site file %s not found', site_file)
    # Report the sites that were created or modified during this run.
    sites = session.query(Site.domain, Site.site_type, Site.base_url
                          ).filter(or_(Site.created_at > dt_before,
                                       Site.updated_at > dt_before
                                       )).order_by(Site.id).all()
    logger.info("Added or updated sites are:\n %s", pprint.pformat(sites))
    logger.info("Done.")
def run(cls, args):
    """Entry point of the lucene command.

    Validates the docopt arguments, initializes the Lucene VM, then
    dispatches to either indexing or searching.
    """
    try:
        # print(args)
        args = cls.args_schema.validate(args)
    except SchemaError as e:
        sys.exit(e)
    db_session = Session()
    # Make sure lucene is inited and attached to the current thread.
    lucene.initVM()
    lucene.getVMEnv().attachCurrentThread()
    if args['--index'] is True:
        configure_logging('lucene.index',
                          console_level=args['--console-log-level'])
        # Meta record tracking the largest article.group_id indexed so far.
        mgid = get_or_create_m(
            db_session,
            MetaInfo,
            data=dict(name='article_group_id_lucene_index',
                      value='0',
                      value_type='int',
                      description='article.group_id used for lucene index'),
            fb_uk='name')
        if args['--mode'] == 'create':
            # A freshly created index starts again from group_id 0.
            mgid.set_value(0)
            db_session.commit()
        logger.debug('Indexing started.. Getting articles..')
        sql = """
        SELECT DISTINCT ON (a.group_id) a.id, a.group_id, a.canonical_url,
            a.title, a.meta, a.content,
            coalesce(a.date_published, a.date_captured) AS pd,
            s.domain, s.site_type
        FROM article AS a
            JOIN site AS s ON s.id=a.site_id
        WHERE a.site_id IS NOT NULL AND s.is_enabled IS TRUE
            AND a.group_id>:gid
        ORDER BY group_id, pd ASC
        """
        articles_iter = db_session.execute(
            sqlalchemy.text(sql).bindparams(gid=mgid.get_value()))
        cls.index(db_session, args['--mode'], articles_iter, mgid)
    elif args['--search'] is True:
        configure_logging('lucene.search',
                          console_level=args['--console-log-level'])
        cls.search(args['--query'], args['--top'])
    else:
        print("Unrecognized command!")
        sys.exit(2)
def run(cls, args):
    """Overriding method as the entry point of this command.

    Parameters
    ----------
    args : dict
        Parsed docopt arguments.
    """
    try:
        args = cls.args_schema.validate(args)
    except SchemaError as e:
        # BUGFIX: `'\n' + e + '\n'` raised TypeError because a SchemaError
        # instance cannot be concatenated with str; format it instead.
        raise SystemExit('\n{}\n'.format(e))
    session = Session(expire_on_commit=False)
    if args['--twitter-streaming'] is True:
        configure_logging('twitter.streaming')
        cls.twitter_stream(session, args)
    elif args['--load-tweets'] is True:
        configure_logging('twitter.load-tweets')
        cls.load_tweets(session, args)
    elif args['--reparse-db-tweets'] is True:
        configure_logging('twitter.reparse-db', file_level='WARNING')
        # Verify the requested table names before reparsing.
        cls._test_table_names(session, args)
        cls.reparse_db(session, args)
def run(cls, args):
    """Overriding method as the entry point of this command.

    Parameters
    ----------
    args : dict
        Parsed docopt arguments.
    """
    session = Session()

    def _resolve_upper_day():
        # Shared by --top-spreader and --top-article: compute the upper
        # day boundary from --force-today / --upper-day.
        if args['--force-today'] is True:
            return datetime.utcnow().date()
        if args['--upper-day'] is None:
            return datetime.utcnow().date() - timedelta(days=1)
        try:
            return parse(args['--upper-day']).date()
        except Exception:
            raise ValueError('Invalid date: %s', args['--upper-day'])

    if args['--volume'] is True:
        configure_logging('report.volume',
                          console_level=args['--console-log-level'],
                          file_level='WARNING')
        table_names = ['tweet', 'url', 'article']
        table = args['--table']
        if table not in table_names:
            logger.critical('Available tables are: %s', table_names)
            sys.exit(2)
        interval_names = [
            'minute', 'hour', 'day', 'week', 'month', 'quarter', 'year'
        ]
        interval = args['--interval']
        if interval not in interval_names:
            logger.critical('Available intervals are: %s', interval_names)
            sys.exit(2)
        limit = args['--limit']
        if int(limit) <= 0:
            logger.critical('%r should larger than 0', limit)
            sys.exit(2)
        # `table` is validated against the whitelist above, so the string
        # interpolation cannot inject arbitrary SQL.
        sql = """
            SELECT count(id) as agg_num,
                date_trunc(:interval, created_at) as interval
            FROM %s
            GROUP BY interval
            ORDER BY interval DESC
            LIMIT :limit""" % table
        stmt = text(sql).bindparams(interval=interval, limit=limit)
        strf = '%Y-%m-%d %H:%M:%S'
        with ENGINE.connect() as conn:
            result = conn.execute(stmt).fetchall()
            print(('-' * 35))
            print(('{0:^20s} | {1:12s}'.format('Timeline (%s)' % interval,
                                               'Aggregation')))
            print(('-' * 35))
            for v, t in result:
                print(('{0:^20s} | {1:8d}'.format(t.strftime(strf), v)))
            print(('-' * 35))
    elif args['--status']:
        configure_logging('report.streaming-status',
                          console_level=args['--console-log-level'])
        table_name = None
        if args['--status'] == 'twitter':
            table_name = 'tweet'
        if table_name is None:
            logger.critical('SNS %r has not been implemented!',
                            args['--status'])
            sys.exit(2)
        # BUGFIX: use the resolved table_name instead of the hard-coded
        # 'tweet' literal, so future platforms query their own table.
        sql = 'SELECT created_at FROM {} ORDER BY id DESC LIMIT 1'.format(
            table_name)
        with ENGINE.connect() as conn:
            most_recent, = conn.execute(text(sql)).fetchone()
        delta_minutes = 30
        delta = timedelta(minutes=delta_minutes)
        current_utc = datetime.utcnow()
        if current_utc - most_recent > delta:
            logger.critical(
                'No %s streaming update in the past %s minutes!',
                args['--status'], delta_minutes)
        else:
            logger.info('Most recent %s streaming update is %s',
                        args['--status'], str(most_recent) + ' (UTC)')
    elif args['--top-spreader'] is True:
        configure_logging('report.top-spreaders',
                          console_level=args['--console-log-level'],
                          file_level='WARNING')
        # Try to create the result table on first use.
        if Top20SpreaderMonthly.__table__.exists(bind=ENGINE) is False:
            Top20SpreaderMonthly.__table__.create(bind=ENGINE)
        upper_day = _resolve_upper_day()
        if args['--generate'] is True:
            # BUGFIX: message typo 'uppder_day' corrected.
            logger.warning('Generating top spreaders for upper_day=%r ...',
                           upper_day)
            cls.generate_top_spreaders(session, upper_day)
        elif args['--look-up'] is True:
            cls.look_up_top_spreaders(session, upper_day,
                                      args['--most-recent'])
    elif args['--top-article'] is True:
        configure_logging('report.top-article',
                          console_level=args['--console-log-level'],
                          file_level='WARNING')
        # Try to create the result table on first use.
        if Top20ArticleMonthly.__table__.exists(bind=ENGINE) is False:
            Top20ArticleMonthly.__table__.create(bind=ENGINE)
        upper_day = _resolve_upper_day()
        if args['--generate'] is True:
            # BUGFIX: message typo 'uppder_day' corrected.
            logger.warning('Generating top articles for upper_day=%r ...',
                           upper_day)
            cls.generate_top_articles(session, upper_day)
        elif args['--look-up'] is True:
            cls.look_up_top_articles(session, upper_day,
                                     args['--most-recent'])
    session.close()
def run(cls, args):
    """Overriding method as the entry point of this command.

    Parameters
    ----------
    args : dict
        Parsed docopt arguments.
    """
    try:
        args = cls.args_schema.validate(args)
    except SchemaError as e:
        raise SystemExit(e)
    session = Session(expire_on_commit=False)
    where_expr = args['--where-expr']
    ob_expr = args.get('--order-by', 'asc')
    limit = args['--limit']
    # --fetch-url
    if args['--fetch-url'] is True:
        configure_logging('crawl.fetch-url', console_level='DEBUG',
                          file_level='WARNING')
        purpose = 'update' if args['--update'] is True else 'archive'
        if where_expr is None:
            where_expr = [text(DEFAULT_WHERE_EXPR_FETCH_URL)]
        else:
            where_expr = [text(where_expr)]
        ob_expr = Site.id.asc() if ob_expr == 'asc' else Site.id.desc()
        msites = get_msites(session, f_expr=where_expr, ob_expr=ob_expr,
                            limit=limit)
        if not msites:
            logger.warning("None sites you queried found in DB!")
            raise SystemExit(2)
        platform_id = get_platform_id(session, name=N_PLATFORM_WEB)
        # Detach msites from the session, since they definitely would
        # not be modified in session.
        for ms in msites:
            session.expunge(ms)
        logger.warning('Starting crawling process to fetch URL update ...')
        cls.fetch_url(session, msites, platform_id, purpose)
    # --fetch-html
    elif args['--fetch-html'] is True:
        configure_logging('crawl.fetch-html', console_level='DEBUG',
                          file_level='WARNING')
        if session.query(Site.id).count() == 0:
            raise SystemExit('Your site table is empty!')
        q = session.query(Url.id, Url.raw)
        if where_expr is None:
            where_expr = [text(DEFAULT_WHERE_EXPR_FETCH_HTML)]
        else:
            where_expr = [text(where_expr)]
        ob_expr = Url.id.asc() if ob_expr == 'asc' else Url.id.desc()
        q = q.filter(*where_expr).order_by(ob_expr)
        if limit is not None:
            q = q.limit(limit)
        logger.info(
            q.statement.compile(compile_kwargs={"literal_binds": True}))
        url_tuples = q.all()
        if not url_tuples:
            logger.warning('No such URLs in DB!')
            raise SystemExit(2)
        # BUGFIX: message typo 'Staring' corrected to 'Starting'.
        logger.warning('Starting crawling process to fetch HTML ...')
        cls.fetch_html(session, url_tuples)
    # --parse-article
    elif args['--parse-article'] is True:
        configure_logging('crawl.parse-article', console_level='DEBUG',
                          file_level='WARNING')
        q = session.query(Url.id, Url.created_at, Url.date_published,
                          Url.canonical, Url.site_id)
        if where_expr is None:
            where_expr = [text(DEFAULT_WHERE_EXPR_PARSE_ARTICLE)]
        else:
            where_expr = [text(where_expr)]
        ob_expr = Url.id.asc() if ob_expr == 'asc' else Url.id.desc()
        q = q.filter(*where_expr).order_by(ob_expr)
        if limit is not None:
            q = q.limit(limit)
        logger.info(
            q.statement.compile(compile_kwargs={"literal_binds": True}))
        url_tuples = q.all()
        if not url_tuples:
            logger.warning('No URLs found from DB!')
            raise SystemExit(2)
        logger.warning('Starting crawling process to parse article ...')
        cls.parse_article(session, url_tuples)
    session.close()
def setup_logging():
    """Configure the 'api' logger; runs once before the first request."""
    configure_logging('api', file_level='WARNING')
def run(cls, args):
    """Overriding method as the entry point of this command.

    Parameters
    ----------
    args : dict
        Parsed docopt arguments.
    """
    session = Session(expire_on_commit=False)
    # Expand user home for the file.
    if args['<file>'] is not None:
        args['<file>'] = os.path.expanduser(args['<file>'])

    def _query_site():
        """Look up the site by --name/--domain; warn and return None if absent."""
        if args['--name'] is not None:
            site_identity = args['--name']
        else:
            site_identity = args['--domain']
        msite = qquery_msite(session, name=args['--name'],
                             domain=args['--domain'])
        if msite is None:
            logger.warning('Site %s does not exist!', site_identity)
        return msite

    # --load-domains commands
    if args['--load-domains'] is True:
        configure_logging('site.load-domains',
                          console_level=args['--console-log-level'],
                          file_level='WARNING')
        # BUGFIX: docopt always supplies the '<file>' key (possibly None),
        # so dict.get's default never applied; use `or` for the fallback.
        fn = args['<file>'] or join(HOAXY_HOME, 'domains.txt')
        logger.info('Loading data from file %r', fn)
        cls.load_domains(session, fn,
                         site_type=args['--site-type'],
                         ignore_inactive=args['--ignore-inactive'],
                         force_inactive=args['--force-inactive'],
                         ignore_redirected=args['--ignore-redirected'])
    # --load-sites commands
    elif args['--load-sites'] is True:
        configure_logging('site.load-sites',
                          console_level=args['--console-log-level'],
                          file_level='WARNING')
        # BUGFIX: same '<file>' fallback issue as --load-domains.
        fn = args['<file>'] or join(HOAXY_HOME, 'sites.yaml')
        logger.info('Loading data from file %r', fn)
        cls.load_sites(session, fn,
                       ignore_inactive=args['--ignore-inactive'],
                       force_inactive=args['--force-inactive'],
                       ignore_redirected=args['--ignore-redirected'])
    # --add commands
    elif args['--add'] is True:
        configure_logging('site.add',
                          console_level=args['--console-log-level'],
                          file_level='WARNING')
        msite = qquery_msite(session, domain=args['--domain'])
        if msite is not None:
            logger.warning('Site %s already exists!', args['--domain'])
        else:
            cls.add_site(session, domain=args['--domain'],
                         site_type=args['--site-type'],
                         name=args['--name'],
                         tag_source=args['--tag-source'],
                         site_tags=args['--site-tag'],
                         alternate_domains=args['--alternate-domain'],
                         ignore_inactive=args['--ignore-inactive'],
                         force_inactive=args['--force-inactive'],
                         ignore_redirected=args['--ignore-redirected'])
    # --add-site-tags
    elif args['--add-site-tags'] is True:
        configure_logging('site.add-site-tags',
                          console_level=args['--console-log-level'],
                          file_level='WARNING')
        msite = _query_site()
        if msite is not None:
            cls.add_site_tags(session, msite, args['--tag-source'],
                              args['--site-tag'])
    # --replace-site-tags
    elif args['--replace-site-tags'] is True:
        # BUGFIX: logging channel typo 'site.repalce-site-tags' corrected.
        configure_logging('site.replace-site-tags',
                          console_level=args['--console-log-level'],
                          file_level='WARNING')
        msite = _query_site()
        if msite is not None:
            cls.replace_site_tags(session, msite, args['--tag-source'],
                                  args['--site-tag'])
    # --add-alternate-domains
    elif args['--add-alternate-domains'] is True:
        configure_logging('site.add-alternate-domains',
                          console_level=args['--console-log-level'],
                          file_level='WARNING')
        msite = _query_site()
        if msite is not None:
            cls.add_alternate_domains(session, msite,
                                      args['--alternate-domain'])
    # --replace-alternate-domains
    elif args['--replace-alternate-domains'] is True:
        configure_logging('site.replace-alternate-domains',
                          console_level=args['--console-log-level'],
                          file_level='WARNING')
        msite = _query_site()
        if msite is not None:
            cls.replace_alternate_domains(session, msite,
                                          args['--alternate-domain'])
    elif args['--disable'] is True:
        configure_logging('site.disable',
                          console_level=args['--console-log-level'],
                          file_level='WARNING')
        msite = _query_site()
        if msite is not None:
            cls.disable_site(session, msite)
    elif args['--enable'] is True:
        configure_logging('site.enable',
                          console_level=args['--console-log-level'],
                          file_level='WARNING')
        msite = _query_site()
        if msite is not None:
            cls.enable_site(session, msite)
    # --status
    elif args['--status'] is True:
        configure_logging('site.status',
                          console_level=args['--console-log-level'],
                          file_level='WARNING')
        cls.site_status(session, args['--include-disabled'] is True)
    # --dump
    elif args['--dump'] is True:
        # BUGFIX: use a dedicated 'site.dump' channel instead of the
        # copy-pasted 'site.status'.
        configure_logging('site.dump',
                          console_level=args['--console-log-level'],
                          file_level='INFO')
        cls.dump(session, args['<file>'])
    session.close()