Пример #1
0
def load_nl_refetch_interval_mapping():
    """Return a dict mapping each ruleset netloc to its refetch interval.

    Every ruleset returned by ``load_rules()`` contributes one entry per
    item in its ``'netlocs'`` collection. The value stored is the ruleset's
    ``'rewalk_interval_days'``, or ``0`` when ``'rewalk_disabled'`` is
    truthy. Rulesets whose ``'netlocs'`` is empty or falsy contribute
    nothing. If a netloc appears in several rulesets, the last one wins.
    """
    mapping = {}

    for entry in load_rules():
        days = entry['rewalk_interval_days']
        # A disabled rewalk is encoded as an interval of zero.
        days = 0 if entry['rewalk_disabled'] else days

        netlocs = entry['netlocs']
        if not netlocs:
            continue

        mapping.update(dict.fromkeys(netlocs, days))

    return mapping
Пример #2
0
def get_random_url_group(num_items):
	"""Sample URLs from ``web_pages`` and report whether each is filtered.

	Runs a ``TABLESAMPLE SYSTEM`` query against ``web_pages`` through
	``g.session``, then, for every returned URL, looks up the bad-word
	lists for the URL's netloc and evaluates ``isFiltered`` with them.

	NOTE(review): ``num_items`` is bound to the ``:percentage`` placeholder,
	so it appears to act as a sampling percentage rather than a row
	count -- confirm against callers.

	Returns a list of ``(url, filtered)`` tuples in the order the rows were
	returned by the query.
	"""
	query = '''SELECT url FROM web_pages TABLESAMPLE SYSTEM(:percentage);'''
	rows = list(g.session.execute(query, {'percentage' : num_items}))

	# Rules are loaded once, after the query, and shared by every URL below.
	active_rules = rules.load_rules(override=True)

	def _classify(url):
		# Pair a URL with the filter verdict for its netloc.
		netloc = urllib.parse.urlparse(url).netloc
		badwords, badcompounds = getBadWords(active_rules, netloc)
		return (url, isFiltered(url, badwords, badcompounds))

	# Each row is a single-column result, hence the 1-tuple unpacking.
	return [_classify(url) for (url,) in rows]
Пример #3
0
 def get_urls(self):
     """Reload the rules and return all their feed URLs as one flat list.

     Stores the result of ``load_rules()`` on ``self.rules`` as a side
     effect, then concatenates the ``'feedurls'`` entry of every rule, in
     rule order.
     """
     self.rules = load_rules()

     collected = []
     for ruleset in self.rules:
         collected.extend(ruleset['feedurls'])

     return collected