def site_status(cls, session, include_disabled=False):
    """Classify every site in the database and refresh stable base URLs.

    Each selected site is probed with ``infer_base_url``. Sites for which
    no base URL can be inferred are reported as inactive. Sites whose
    inferred base URL is confirmed by ``owns_url`` are reported as stable,
    and every other site is reported as redirected. The ``base_url``
    column of each stable site is updated with the inferred value.

    Parameters
    ----------
    session : object
        Database session providing ``execute``, ``query`` and ``commit``.
    include_disabled : bool
        When exactly ``True``, sites are selected regardless of
        ``is_enabled``; otherwise only enabled sites are checked.

    Returns
    -------
    None
        Results are written to the log and to the database.
    """
    if include_disabled is True:
        query = 'SELECT id, domain FROM site ORDER BY domain'
    else:
        query = ('SELECT id, domain FROM site WHERE is_enabled IS TRUE'
                 ' ORDER BY domain')

    stable, redirected, inactive = [], [], []
    for site_id, domain in session.execute(query).fetchall():
        base_url = infer_base_url(domain)
        if base_url is None:
            inactive.append({'id': site_id, 'domain': domain})
            continue
        # Only an explicit True from owns_url marks the site as stable.
        if owns_url(domain, base_url) is True:
            bucket = stable
        else:
            bucket = redirected
        bucket.append({'id': site_id, 'domain': domain, 'base_url': base_url})

    logger.info('Stable sites: %s', pprint.pformat(stable))
    # Persist the freshly inferred base URL of every stable site.
    for entry in stable:
        matching = session.query(Site).filter_by(id=entry['id'])
        matching.update({'base_url': entry['base_url']},
                        synchronize_session=False)
    session.commit()
    logger.info('Inactive sites: %s', pprint.pformat(inactive))
    logger.info('Redirected sites: %s', pprint.pformat(redirected))
def parse_site(site):
    """Fill the optional fields of a site dict.

    The dict is modified in place. When ``base_url`` is absent it is
    derived from ``domain``: either taken from ``infer_base_url`` or, for
    sites that are declared or found to be inactive, built as
    ``'http://' + domain + '/'``. ``fill_rules`` is always applied last.

    Parameters
    ----------
    site : dict
        Site description. Must contain ``domain`` whenever ``base_url``
        is missing.

    Returns
    -------
    tuple
        (site, status) where status is one of ``'ok'``, ``'invalid'``,
        ``'inactive'`` or ``'redirected'``.
    """
    status = 'ok'
    if any(field not in site for field in REQ_FIELDS):
        status = 'invalid'
    # NOTE(review): an 'invalid' status can still be overwritten by
    # 'inactive' / 'redirected' below -- confirm whether callers rely on
    # that before changing it.

    if 'base_url' not in site:
        if isinstance(site, str):
            # A bare string is not a site dict. Report it through the
            # logger (instead of printing to stdout); the lookup below
            # then fails exactly as it did before.
            logger.error('Expected a site dict but got a string: %r', site)
        if site.get('is_alive', True) is False:
            # Explicitly marked dead: skip the network probe. ``is_alive``
            # is already False, so only the base URL needs filling.
            site['base_url'] = 'http://' + site['domain'] + '/'
            status = 'inactive'
        else:
            domain = site['domain']
            base_url = infer_base_url(domain)
            if base_url is None:
                logger.warning('Domain %s is inactive!', domain)
                site['base_url'] = 'http://' + domain + '/'
                site['is_alive'] = False
                status = 'inactive'
            else:
                # The site responds; it is 'redirected' when the inferred
                # URL is not owned by the domain itself.
                if not owns_url(domain, base_url):
                    status = 'redirected'
                site['base_url'] = base_url
                site['is_alive'] = True

    fill_rules(site)
    return (site, status)
def parse_domain(line, site_type):
    """Validate a raw domain line and build the matching site dict.

    The line is lower-cased and stripped, checked against ``DOMAIN_RE``,
    and a leading ``www.`` is removed. Domains of three characters or
    fewer (after that removal) are rejected.

    Parameters
    ----------
    line : string
        Raw text holding one domain.
    site_type : {'claim', 'fact_checking'}

    Returns
    -------
    tuple
        (site, status). ``site`` is None when status is ``'invalid'``;
        otherwise status is ``'inactive'``, ``'ok'`` or ``'redirected'``.
    """
    rejected = (None, 'invalid')
    domain = line.lower().strip()
    if DOMAIN_RE.match(domain) is None:
        return rejected
    if domain.startswith('www.'):
        domain = domain[4:]
    if len(domain) <= 3:
        return rejected

    base_url = infer_base_url(domain)
    if base_url is None:
        # No reachable base URL: fall back to plain http and flag as dead.
        status = 'inactive'
        site = {
            'name': domain,
            'domain': domain,
            'base_url': 'http://' + domain + '/',
            'site_type': site_type,
            'is_alive': False,
        }
    else:
        status = 'ok' if owns_url(domain, base_url) else 'redirected'
        site = {
            'name': domain,
            'domain': domain,
            'base_url': base_url,
            'site_type': site_type,
        }
    fill_rules(site)
    return (site, status)