def find_good_pometa(): category_name = u"Категория:Шаблоны_помет" categories = [get_category(category_name)] categories += get_subcategories(category_name) pometa_list = list() for category in categories: # print category.title() for article in category.articles(): m = re.match(u'Шаблон:(.*\.)', article.title()) if m: pometa = m.group(1) pometa_list.append(pometa) print pometa return pometa_list
def analyze_category(title): queue = [title] while queue: title = queue.pop() print u"→", title if re.search(r'[\\"?*|<>]', title): print " ×", title, "—", "bad symbols in title" continue if title in processed_titles: print " ×", title, "—", "already used" continue lang_skipping = re.match(u".*(/[-a-z]{2,8}|по языкам)$", title) # if lang_skipping: # print ' ×', title, '—', 'lang: skipping' # continue processed_titles.append(title) file_title = title.replace("/", "#").replace(":", "%") if lang_skipping: dirname = lang_skipping.group(1).replace("/", "") skip_path = join(categories_path, "#", dirname) if not exists(skip_path): os.mkdir(skip_path) filename = join(skip_path, file_title) else: filename = join(categories_path, file_title) # complete_filename = join(categories_path, 'complete', file_title) # blocked_filename = join(categories_path, 'blocked', file_title) # if exists(complete_filename): # # print u' ×', 'already exist' # return # if exists(blocked_filename): # print u' ×', title, '—', 'blocked' # return if exists(filename): # print u' ←', 'exist, reading' base_titles, sub_titles = read_titles(filename) else: category = get_category("%s%s" % (category_prefix, title)) base_titles = process_categories(category.categories()) sub_titles = process_categories(category.subcategories()) base_content = "\n".join(map(lambda x: "< %s" % x, base_titles)) or "-" sub_content = "\n".join(map(lambda x: "> %s" % x, sub_titles)) or "-" content = "%s\n\n%s\n" % (base_content, sub_content) save_file(filename, content, encode="utf-8") print u" +", title, "—", "saved" for sub_title in base_titles + sub_titles: if sub_title not in processed_titles: queue.append(sub_title)